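# Data Processing Notebook

Prepares the stereo `.wav` recordings in `../voice_data/`: split each recording into left/right mono channels, sanity-check the resulting files, and remove silence. (Noise reduction is planned but not yet implemented.)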

In [4]:
import sys
sys.path.append('../')

from functions import processing_funcs, utils
from pydub.utils import mediainfo
import noisereduce

### Split stereo channels into mono:

In [2]:
%%time

# The %%time cell magic above must stay on the first line of the cell; it
# reports how long the whole cell takes to run.
#
# Split every stereo recording found in ../voice_data/ into mono channels.
# NOTE(review): judging by the later cells, the per-channel files end up in
# ../voice_data/mono_channels/ as <id>_left.wav / <id>_right.wav -- confirm
# against processing_funcs.split_stereo_audio.
processing_funcs.split_stereo_audio(voice_dir_path = '../voice_data/')

Collecting all files in ../voice_data/ matching regular expression [0-9]+\.wav.

Splitting file 1 of 31 at ../voice_data/4175.wav...

Splitting file 2 of 31 at ../voice_data/4504.wav...

Splitting file 3 of 31 at ../voice_data/4708.wav...

Splitting file 4 of 31 at ../voice_data/4745.wav...

Splitting file 5 of 31 at ../voice_data/4823.wav...

Splitting file 6 of 31 at ../voice_data/4874.wav...

Splitting file 7 of 31 at ../voice_data/4889.wav...

Splitting file 8 of 31 at ../voice_data/4984.wav...

Splitting file 9 of 31 at ../voice_data/5000.wav...

Splitting file 10 of 31 at ../voice_data/5051.wav...

Splitting file 11 of 31 at ../voice_data/5220.wav...

Splitting file 12 of 31 at ../voice_data/5635.wav...

Splitting file 13 of 31 at ../voice_data/5926.wav...

Splitting file 14 of 31 at ../voice_data/6015.wav...

Splitting file 15 of 31 at ../voice_data/6062.wav...

Splitting file 16 of 31 at ../voice_data/6065.wav...

Splitting file 17 of 31 at ../voice_data/6093.wav...

Splitting 

### All of our stereo audio should now be split into mono channels:

In [3]:
# Directory that holds the per-channel (mono) files.
mono_dir_path = '../voice_data/mono_channels/'

# Collect the names matching "<digits>_left.wav" / "<digits>_right.wav".
mono_dir = utils.read_dir_files(
    file_regex=r'[0-9]+\_(?:left|right)\.wav',
    dir_path=mono_dir_path,
)

# Report how many mono files were found, followed by a blank line.
mono_file_count = len(mono_dir)
print('Total number of mono files:', mono_file_count, '\n')

# List every collected file name, one per line.
for mono_file in mono_dir:
    print(mono_file)

Collecting all files in ../voice_data/mono_channels/ matching regular expression [0-9]+\_(?:left|right)\.wav.

Total number of mono files: 62 

4175_left.wav
4175_right.wav
4504_left.wav
4504_right.wav
4708_left.wav
4708_right.wav
4745_left.wav
4745_right.wav
4823_left.wav
4823_right.wav
4874_left.wav
4874_right.wav
4889_left.wav
4889_right.wav
4984_left.wav
4984_right.wav
5000_left.wav
5000_right.wav
5051_left.wav
5051_right.wav
5220_left.wav
5220_right.wav
5635_left.wav
5635_right.wav
5926_left.wav
5926_right.wav
6015_left.wav
6015_right.wav
6062_left.wav
6062_right.wav
6065_left.wav
6065_right.wav
6093_left.wav
6093_right.wav
6126_left.wav
6126_right.wav
6157_left.wav
6157_right.wav
6193_left.wav
6193_right.wav
6239_left.wav
6239_right.wav
6255_left.wav
6255_right.wav
6278_left.wav
6278_right.wav
6372_left.wav
6372_right.wav
6379_left.wav
6379_right.wav
6476_left.wav
6476_right.wav
6862_left.wav
6862_right.wav
6869_left.wav
6869_right.wav
6899_left.wav
6899_right.wav
6938_left.wav
6

### We now have 62 mono files: 2 channels × 31 original stereo files, as expected.

Next, check each mono file's channel count, sampling rate, and duration to make sure they look okay:

In [4]:
# Probe each mono file and print one summary line per file:
# path | channel count | sampling rate | duration in minutes.
for mono_file in mono_dir:
    file_path = mono_dir_path + mono_file
    file_info = mediainfo(file_path)

    channels = file_info['channels']
    sample_rate = file_info['sample_rate']
    # The reported duration is divided by 60 to get minutes (presumably it is
    # in seconds -- confirm against pydub), then rounded to 2 decimal places.
    duration_mins = round(float(file_info['duration']) / 60, 2)

    print('File:', file_path, '|',
          'Number of Channels:', channels, '|',
          'Sampling Rate:', sample_rate, '|',
          'Duration (mins):', duration_mins)

File: ../voice_data/mono_channels/4175_left.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (mins): 30.0
File: ../voice_data/mono_channels/4175_right.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (mins): 30.0
File: ../voice_data/mono_channels/4504_left.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (mins): 7.85
File: ../voice_data/mono_channels/4504_right.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (mins): 7.85
File: ../voice_data/mono_channels/4708_left.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (mins): 30.0
File: ../voice_data/mono_channels/4708_right.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (mins): 30.0
File: ../voice_data/mono_channels/4745_left.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (mins): 23.76
File: ../voice_data/mono_channels/4745_right.wav | Number of Channels: 1 | Sampling Rate: 8000 | Duration (mins): 23.76
File: ../voice_data/mono_channels/4823_left.wav | 

### Channels and duration look good, and the native sampling rate was preserved. Perfect.

### Now to eliminate noise as much as possible from the mono files:

We plan to use the `noisereduce` package and evaluate how it affects our data. This step is not implemented yet; the cell below is only a placeholder:

In [5]:
# TODO: noise reduction is not implemented yet -- apply noisereduce.reduce_noise to each mono file here.

### Now to remove silence from the audio files:

In [2]:
%%time

# The %%time cell magic above must stay on the first line of the cell; it
# reports how long the whole cell takes to run.
#
# Remove periods of silence from each mono channel file in
# ../voice_data/mono_channels/.
# NOTE(review): whether the files are overwritten in place or written to a new
# location is not visible here -- check processing_funcs.remove_silence before
# re-running this cell.
processing_funcs.remove_silence(mono_channel_dir = '../voice_data/mono_channels/')

Collecting all files in ../voice_data/mono_channels/ matching regular expression [0-9]+\_(?:left|right)\.wav.

Removing periods of silence from file 1 of 62...

Removing periods of silence from file 2 of 62...

Removing periods of silence from file 3 of 62...

Removing periods of silence from file 4 of 62...

Removing periods of silence from file 5 of 62...

Removing periods of silence from file 6 of 62...

Removing periods of silence from file 7 of 62...

Removing periods of silence from file 8 of 62...

Removing periods of silence from file 9 of 62...

Removing periods of silence from file 10 of 62...

Removing periods of silence from file 11 of 62...

Removing periods of silence from file 12 of 62...

Removing periods of silence from file 13 of 62...

Removing periods of silence from file 14 of 62...

Removing periods of silence from file 15 of 62...

Removing periods of silence from file 16 of 62...

Removing periods of silence from file 17 of 62...

Removing periods of silence fro