In [1]:
# 初期設定

!pip install scipy

sample_len = 10 # seconds
sampling_rate = 44100
data_len = sample_len * sampling_rate

Collecting scipy
  Downloading scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[K     |████████████████████████████████| 34.5 MB 14.0 MB/s eta 0:00:01
Installing collected packages: scipy
Successfully installed scipy-1.10.1
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [6]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

# Sampling layer

class Sampling(layers.Layer):
    """Sample a latent vector from (z_mean, z_log_var).

    The layer receives a pair ``(z_mean, z_log_var)`` and returns
    ``z_mean + exp(0.5 * z_log_var) * noise`` where ``noise`` is drawn from a
    standard normal distribution with one value per (row, column) of
    ``z_mean``.
    """

    def call(self, inputs):
        z_mean, z_log_var = inputs
        mean_shape = tf.shape(z_mean)
        # One standard-normal draw per latent coordinate of every sample.
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        # exp(0.5 * log(variance)) is the standard deviation.
        sigma = tf.exp(0.5 * z_log_var)
        return z_mean + sigma * noise

# Encoder

latent_dim = 2

encoder_inputs = keras.Input(shape=(data_len, 1))
# Two stride-2 convolutions; each one halves the length of the time axis.
x = encoder_inputs
for filters in (32, 64):
    x = layers.Conv1D(filters, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)
# Two parallel heads produce the parameters that the Sampling layer consumes.
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(
    inputs=encoder_inputs,
    outputs=[z_mean, z_log_var, z],
    name="encoder",
)
encoder.summary()

# Decoder

latent_inputs = keras.Input(shape=(latent_dim,))
# Expand the latent vector to (data_len // 4, 64) so that the two stride-2
# transposed convolutions below bring the time axis back to data_len.
x = layers.Dense(data_len // 4 * 64, activation="relu")(latent_inputs)
x = layers.Reshape((data_len // 4, 64))(x)
for filters in (64, 32):
    x = layers.Conv1DTranspose(filters, 3, activation="relu", strides=2, padding="same")(x)
# Single-channel output squashed into (0, 1) by the sigmoid.
decoder_outputs = layers.Conv1DTranspose(1, 3, activation="sigmoid", padding="same")(x)
decoder = keras.Model(inputs=latent_inputs, outputs=decoder_outputs, name="decoder")
decoder.summary()

# VAE

class VAE(keras.Model):
    """Variational autoencoder built from a separate encoder and decoder.

    Args:
        encoder: Model that maps a batch of inputs to ``(z_mean, z_log_var, z)``.
        decoder: Model that maps a batch of latent vectors to reconstructions.
        **kwargs: Forwarded to ``keras.Model.__init__``.

    Training minimises the sum of a binary cross-entropy reconstruction loss
    and the KL divergence term computed from ``z_mean`` and ``z_log_var``.
    The running means of the three losses are exposed through ``metrics``.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """Loss trackers; listing them here lets Keras reset them every epoch."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def predict(self, x, **kwargs):
        """Reconstruct ``x`` deterministically from the latent mean.

        Args:
            x: Input batch accepted by ``self.encoder.predict``.
            **kwargs: Optional keyword arguments (e.g. ``batch_size``,
                ``verbose``) forwarded to both ``encoder.predict`` and
                ``decoder.predict``.

        Returns:
            The decoder output for ``z_mean`` (the sampled ``z`` is ignored).
        """
        z_mean, _, _ = self.encoder.predict(x, **kwargs)
        return self.decoder.predict(z_mean, **kwargs)

    def train_step(self, data):
        """Run one optimisation step and return the running loss means.

        Args:
            data: A batch of inputs, or a tuple whose first element is the batch.

        Returns:
            Dict with the keys ``loss``, ``reconstruction_loss`` and ``kl_loss``.
        """
        # fit() may hand over the batch wrapped in a tuple; only the inputs are used.
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            # binary_crossentropy averages over the last (channel) axis, so for
            # (batch, time, 1) inputs the result is (batch, time). Only axis 1
            # is left to sum over; asking for axis 2 as well raises an error.
            per_step_loss = keras.losses.binary_crossentropy(data, reconstruction)
            reconstruction_loss = tf.reduce_mean(tf.reduce_sum(per_step_loss, axis=1))
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 441000, 1)]  0           []                               
                                                                                                  
 conv1d_2 (Conv1D)              (None, 220500, 32)   128         ['input_3[0][0]']                
                                                                                                  
 conv1d_3 (Conv1D)              (None, 110250, 64)   6208        ['conv1d_2[0][0]']               
                                                                                                  
 flatten_1 (Flatten)            (None, 7056000)      0           ['conv1d_3[0][0]']               
                                                                                            

In [3]:
# Data creation

from scipy.io.wavfile import read
import glob

# Sort so the clip order does not depend on the filesystem's enumeration order.
files = sorted(glob.glob("../data/wav44100/*"))
# read() returns (rate, samples); only the samples are kept.
# NOTE(review): the rate is not checked against sampling_rate, and the
# reshape below assumes mono (1-D) sample arrays - confirm for this dataset.
raw_data_list = [read(file)[1] for file in files]

# Cut every recording into consecutive, non-overlapping clips of data_len
# samples. A trailing remainder shorter than data_len is dropped, and
# recordings shorter than one clip contribute nothing.
# The empty int16 seed keeps the result 2-D even when no clip is found.
clip_batches = [np.zeros((0, data_len), dtype=np.int16)]
for raw_data in raw_data_list:
    n_clips = len(raw_data) // data_len
    if n_clips == 0:
        continue
    clip_batches.append(raw_data[:n_clips * data_len].reshape(n_clips, data_len))

# Concatenate once instead of growing the array inside the loop.
input_data_array = np.concatenate(clip_batches, axis=0)

input_data_array.shape

(4149, 441000)

In [4]:
# Split into training data and test data

train_data_rate = 0.8
split_seed = 0

# A seeded generator makes the split reproducible after a kernel restart.
# The rows are shuffled in place so the large array is not copied; note that
# re-running this cell without rebuilding input_data_array shuffles it again.
np.random.default_rng(split_seed).shuffle(input_data_array)

# Index of the first test row; the two slices below are views, not copies.
n_train = int(input_data_array.shape[0] * train_data_rate)
x_train = input_data_array[:n_train, :]
x_test = input_data_array[n_train:, :]

print(x_train.shape)
print(x_test.shape)

(3319, 441000)
(830, 441000)


In [None]:
# Training

# Train on the training split only, so that x_test stays unseen and can be
# used for evaluation. (The variable keeps its original name `digits`.)
int16_info = np.iinfo(np.int16)
digits = np.expand_dims(x_train, -1).astype("float32")
# The decoder ends in a sigmoid and the loss is binary cross-entropy, so the
# targets must lie in [0, 1]. Map the int16 range [-32768, 32767] onto [0, 1],
# in place to avoid extra copies of the large array. To turn a reconstruction y
# back into int16 samples use: y * (int16_info.max - int16_info.min) + int16_info.min
digits -= int16_info.min
digits /= int16_info.max - int16_info.min

vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())
history = vae.fit(digits, epochs=3, batch_size=128)

vae.save_weights("vae")