In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
import os

In [3]:
os.listdir('..')

['.git',
 '.gitignore',
 '16000_pcm_speeches',
 'auto',
 'data-processing',
 'design-pattern',
 'model',
 'README.md',
 'rs',
 'statstics',
 'web-scraping']

In [27]:
def read_audio_from_wav(path, is_print=False):
    sample, sampling_rate = tf.audio.decode_wav(
        contents=tf.io.read_file(path),
        desired_channels=1
    )
    if is_print:
        print(f'{path} shape: {sample.shape}')
        print(f'採樣率: {sampling_rate.numpy()}')
    return sample, sampling_rate

In [28]:
# 讀取wav檔案
wav_file = '../16000_pcm_speeches/Benjamin_Netanyau/1.wav'

sample, sampling_rate = tf.audio.decode_wav(contents=tf.io.read_file(wav_file), desired_channels=1)

In [29]:
sample

<tf.Tensor: shape=(16000, 1), dtype=float32, numpy=
array([[ 0.00396729],
       [-0.00119019],
       [-0.00973511],
       ...,
       [ 0.20730591],
       [ 0.18551636],
       [ 0.1104126 ]], dtype=float32)>

In [30]:
sampling_rate

<tf.Tensor: shape=(), dtype=int32, numpy=16000>

In [31]:
read_audio_from_wav('bird_sing.wav', True)

bird_sing.wav shape: (1140472, 1)
採樣率: 44100


(<tf.Tensor: shape=(1140472, 1), dtype=float32, numpy=
 array([[-3.0517578e-05],
        [-3.0517578e-05],
        [-3.0517578e-05],
        ...,
        [-3.0517578e-05],
        [-3.0517578e-05],
        [ 0.0000000e+00]], dtype=float32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=44100>)

In [32]:
# 秒數
1140472 // 44100

25

In [33]:
read_audio_from_wav(wav_file, True)

../16000_pcm_speeches/Benjamin_Netanyau/1.wav shape: (16000, 1)
採樣率: 16000


(<tf.Tensor: shape=(16000, 1), dtype=float32, numpy=
 array([[ 0.00396729],
        [-0.00119019],
        [-0.00973511],
        ...,
        [ 0.20730591],
        [ 0.18551636],
        [ 0.1104126 ]], dtype=float32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=16000>)

In [35]:
audio, _ = read_audio_from_wav('bird_sing.wav', True)
sample_rate = 16000

wav_string = tf.audio.encode_wav(
    audio, sample_rate, name='bird_sing_16000.wav'
)

bird_sing.wav shape: (1140472, 1)
採樣率: 44100


In [40]:
# 上面取得的wav_string可以透過decode去轉換成float
# 可以發現samling_rate=16000

tf.audio.decode_wav(wav_string)

DecodeWav(audio=<tf.Tensor: shape=(1140472, 1), dtype=float32, numpy=
array([[-3.0517578e-05],
       [-3.0517578e-05],
       [-3.0517578e-05],
       ...,
       [-3.0517578e-05],
       [-3.0517578e-05],
       [ 0.0000000e+00]], dtype=float32)>, sample_rate=<tf.Tensor: shape=(), dtype=int32, numpy=16000>)

In [45]:
# 轉換sampling rate

def tranform_sampling_rate(path, sampling_rate):
    audio, s = tf.audio.decode_wav(
        tf.io.read_file(path),
        desired_channels=1
    )
    wav_string = tf.audio.encode_wav(audio, sampling_rate)
    new_audio, new_s = tf.audio.decode_wav(
        wav_string
    )
    print(f'採樣率轉換: {s.numpy()} -> {new_s.numpy()}')

    return new_audio, new_s
    
    
tranform_sampling_rate('bird_sing.wav', 16000)

採樣率轉換: 44100 -> 16000


(<tf.Tensor: shape=(1140472, 1), dtype=float32, numpy=
 array([[-3.0517578e-05],
        [-3.0517578e-05],
        [-3.0517578e-05],
        ...,
        [-3.0517578e-05],
        [-3.0517578e-05],
        [ 0.0000000e+00]], dtype=float32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=16000>)