# Data Exploration Notebook:

In [16]:
# import librosa
# import librosa.display
# import soundfile as sf
# import IPython.display as ipd
# import numpy as np
# import matplotlib.pyplot as plt

import os
import pydub
from pydub import AudioSegment
from pydub.utils import mediainfo
import regex as re
import librosa
import IPython.display as ipd

### Count files and show filepaths:

In [17]:
# Relative path of the directory that holds the .wav recordings.
# Keep the trailing slash: other cells build file paths by plain string concatenation (vd_dir + name).
vd_dir = '../voice_data/'

In [18]:
# Go through our directory and collect the names of all the recordings.
# fullmatch() anchors the pattern at both ends, so only names that are exactly
# "<digits>.wav" are kept (match() alone would also accept e.g. "4175.wav.bak").
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# listing (and everything derived from it) reproducible between runs.
voice_files = sorted(
    name for name in os.listdir(vd_dir) if re.fullmatch(r'[0-9]+\.wav', name)
)

In [19]:
# Report how many recordings were found, then show their file names.
file_count = len(voice_files)
print('Number of files:', file_count, '\n')
print(voice_files)

Number of files: 31 

['4175.wav', '4504.wav', '4708.wav', '4745.wav', '4823.wav', '4874.wav', '4889.wav', '4984.wav', '5000.wav', '5051.wav', '5220.wav', '5635.wav', '5926.wav', '6015.wav', '6062.wav', '6065.wav', '6093.wav', '6126.wav', '6157.wav', '6193.wav', '6239.wav', '6255.wav', '6278.wav', '6372.wav', '6379.wav', '6476.wav', '6862.wav', '6869.wav', '6899.wav', '6938.wav', '6952.wav']


#### We have 31 files in all, each labeled with a unique 4-digit number.

### Basic information for each file:

In [20]:
# Print a one-line summary per recording (channel count, sampling rate, length),
# read from the metadata dictionary returned by pydub's mediainfo.
for file in voice_files:
    file_path = vd_dir + file
    file_info = mediainfo(file_path)
    # The reported duration is divided by 60 to express it in minutes.
    duration_mins = round(float(file_info['duration']) / 60, 2)
    summary = (
        'File:', file_path, '|',
        'Number of Channels:', file_info['channels'], '|',
        'Sampling Rate:', file_info['sample_rate'], '|',
        'Duration (mins):', duration_mins,
    )
    print(*summary)

File: ../voice_data/4175.wav | Number of Channels: 2 | Sampling Rate: 8000 | Duration (mins): 30.0
File: ../voice_data/4504.wav | Number of Channels: 2 | Sampling Rate: 8000 | Duration (mins): 7.85
File: ../voice_data/4708.wav | Number of Channels: 2 | Sampling Rate: 8000 | Duration (mins): 30.0
File: ../voice_data/4745.wav | Number of Channels: 2 | Sampling Rate: 8000 | Duration (mins): 23.76
File: ../voice_data/4823.wav | Number of Channels: 2 | Sampling Rate: 8000 | Duration (mins): 30.0
File: ../voice_data/4874.wav | Number of Channels: 2 | Sampling Rate: 8000 | Duration (mins): 30.0
File: ../voice_data/4889.wav | Number of Channels: 2 | Sampling Rate: 8000 | Duration (mins): 25.32
File: ../voice_data/4984.wav | Number of Channels: 2 | Sampling Rate: 8000 | Duration (mins): 30.0
File: ../voice_data/5000.wav | Number of Channels: 2 | Sampling Rate: 8000 | Duration (mins): 30.0
File: ../voice_data/5051.wav | Number of Channels: 2 | Sampling Rate: 8000 | Duration (mins): 26.33
File: .

#### All of our files have 2 channels (stereo) and were sampled at a rate of 8 kHz.

#### Some of our audio files are fairly short (as little as 7.85 minutes), while most are 30 minutes long. Hopefully we'll have enough data from each file to use them for our project.

### Create a test audio clip to explore:

In [21]:
# Pick a single recording (4175.wav) and load it with pydub so we can explore it more closely.
test_fp = vd_dir + '4175.wav'
test_audio = AudioSegment.from_file(test_fp)

In [22]:
# Cut a one-minute excerpt out of the loaded recording.
# The slice bounds are in milliseconds: from the 2-minute mark up to the 3-minute mark.
clip_start = 2 * 60 * 1000  # 120000 ms
clip_end = 3 * 60 * 1000    # 180000 ms

test_audio_clip = test_audio[clip_start:clip_end]

In [23]:
# Save the test audio clip as a .wav file.
# export() returns the open file object it wrote to; close it straight away so the
# handle is not leaked (otherwise the notebook's output cache keeps it open).
test_audio_clip.export(out_f='test_audio_clip.wav', format='wav').close()

<_io.BufferedRandom name='test_audio_clip.wav'>

In [24]:
# Path of the one-minute clip; this is the same file name the export step writes to.
test_clip_fp = 'test_audio_clip.wav'

In [25]:
# This will create an interactive button we can use to listen to the audio in the notebook!
# Keep this call as the last expression of the cell so that the player is displayed.
ipd.Audio(test_clip_fp)

In [26]:
# librosa.load uses a default sampling rate of 22050, but we'll specify sr=None so that it preserves our native sampling rate (8kHz)
# NOTE(review): librosa.load also defaults to mono=True, so the 2-channel clip is presumably down-mixed to a single channel here — confirm this is intended.
test, test_sr = librosa.load(test_clip_fp, sr=None)

print('Sampling rate =', test_sr)

Sampling rate = 8000
