<a href="https://colab.research.google.com/github/EduardoIllanes/Reconstruccion-e-Interpolacion-de-Audio/blob/main/Tensorflow_Siren.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reconstrucción de audio

In [None]:
# Install librosa (imported below for audio loading); pip's output is redirected to /dev/null.
!pip install librosa &> /dev/null

In [None]:
import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
import numpy as np
import scipy.io.wavfile as wavfile
import io
from IPython.display import Audio
import librosa

Creamos la arquitectura de nuestra SIREN

In [None]:
class SineLayer(Layer):
  """Fully connected layer with a sine activation: ``sin(omega_0 * linear(x))``.

  Args:
    out_features: Number of output units of the inner ``Dense`` layer.
    is_first: Selects the kernel initialisation. With ``n`` the size of the
      last input dimension, the first layer draws weights from
      ``U(-1/n, 1/n)``; every other layer draws them from
      ``U(-sqrt(6/n)/omega_0, sqrt(6/n)/omega_0)``.
    bias: Whether the inner ``Dense`` layer has a bias term.
    omega_0: Factor that multiplies the linear output before the sine.
  """

  def __init__(self, out_features, is_first = False, bias = True, omega_0 = 30):
    super().__init__()
    self.out_features = out_features
    self.omega_0 = omega_0
    self.is_first = is_first
    # Kept so that build() can honour the `bias` argument.
    self.use_bias = bias

  def build(self, input_shape):
    """Create the inner ``Dense`` layer once the input width is known."""
    in_features = input_shape[-1]
    if in_features is None:
      raise ValueError("SineLayer needs a defined last input dimension to "
                       "initialise its weights.")

    if self.is_first:
      limit = 1 / in_features
    else:
      limit = np.sqrt(6 / in_features) / self.omega_0

    # No `input_shape` is passed to Dense: the sub-layer is built lazily on
    # its first call, and that kwarg expects a shape without the batch axis.
    self.linear = Dense(self.out_features,
                        use_bias = self.use_bias,
                        kernel_initializer = tf.keras.initializers.RandomUniform(minval = -limit,
                                                                                 maxval = limit))

  def call(self, x):
    """Apply the linear map, scale by ``omega_0`` and take the sine."""
    return tf.math.sin(self.omega_0 * self.linear(x))

In [None]:
class Siren(Layer):
  """Stack of ``SineLayer`` blocks wrapped as a single Keras layer.

  The stack is: one first ``SineLayer``, then ``hidden_layers`` hidden
  ``SineLayer`` blocks, then an output layer with ``out_features`` units.

  Args:
    hidden_features: Width of the first and hidden sine layers.
    hidden_layers: Number of hidden sine layers after the first one.
    out_features: Number of output units.
    outermost_linear: If True the output layer is a plain ``Dense`` layer,
      otherwise it is another ``SineLayer``.
    first_omega_0: ``omega_0`` used by the first sine layer.
    hidden_omega_0: ``omega_0`` used by the hidden sine layers, by a sine
      output layer, and to scale the linear output layer's initialisation.
  """

  def __init__(self, hidden_features, hidden_layers, out_features, outermost_linear=False, 
                 first_omega_0=30, hidden_omega_0=30.):
    super().__init__()
    self.hidden_layers = hidden_layers
    self.hidden_features = hidden_features
    self.first_omega_0 = first_omega_0
    self.hidden_omega_0 = hidden_omega_0
    self.out_features = out_features
    self.net = Sequential()
    self.outermost_linear = outermost_linear

  def build(self, input_shape):
    """Populate ``self.net`` with the sine layers and the output layer."""
    self.net.add(SineLayer(self.hidden_features,
                           is_first=True, omega_0=self.first_omega_0))
    for _ in range(self.hidden_layers):
      self.net.add(SineLayer(self.hidden_features,
                             is_first=False, omega_0=self.hidden_omega_0))

    if self.outermost_linear:
      # The output layer is fed by the last hidden layer, so its fan-in is
      # `hidden_features`, not the width of the network input.
      limit = np.sqrt(6 / self.hidden_features) / self.hidden_omega_0
      self.net.add(Dense(self.out_features,
                         kernel_initializer = tf.keras.initializers.RandomUniform(minval = -limit,
                                                                                  maxval = limit)))
    else:
      self.net.add(SineLayer(self.out_features,
                             is_first=False, omega_0=self.hidden_omega_0))

  def call(self, x):
    """Run ``x`` through the whole stack."""
    return self.net(x)


Obtenemos las canciones
**Nota:** La otra canción (`ludovico-einaudi.wav`) se encuentra en el repositorio de GitHub.

In [None]:
# Download gt_bach.wav into the working directory; it is loaded in the processing cell below.
!wget https://vsitzmann.github.io/siren/img/audio/gt_bach.wav

--2021-07-22 03:15:41--  https://vsitzmann.github.io/siren/img/audio/gt_bach.wav
Resolving vsitzmann.github.io (vsitzmann.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to vsitzmann.github.io (vsitzmann.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1232886 (1.2M) [audio/wav]
Saving to: ‘gt_bach.wav’


2021-07-22 03:15:41 (27.2 MB/s) - ‘gt_bach.wav’ saved [1232886/1232886]



Procesamos los audios

In [None]:
# Load both recordings; the sample rates returned by librosa are discarded.
audio, _ = librosa.load('ludovico-einaudi.wav')
audio_1, _ = librosa.load('gt_bach.wav')

# Append a trailing axis of size 1 so each sample is one row of the input.
audio_1 = audio_1[..., np.newaxis]

# np.resize truncates or repeats the data so both clips have the same shape.
audio = np.resize(audio[..., np.newaxis], audio_1.shape)
print(audio.shape, audio_1.shape)

(154104, 1) (154104, 1)


Creamos nuestro modelo

In [None]:
def create_model():
  """Build a Keras model that maps a single input feature through a Siren.

  Returns:
    A ``Model`` whose input has shape ``(1,)`` and whose output is produced
    by a Siren with 256 hidden features, 3 hidden layers, 1 output feature
    and a linear output layer.
  """
  inputs = Input(shape=(1,))
  # omega is changed so that the amplitude of the audio can be scaled.
  siren = Siren(hidden_features=256, hidden_layers=3, out_features=1,
                outermost_linear=True, first_omega_0=30000)
  return Model(inputs = inputs, outputs = siren(inputs))

In [None]:
# Instantiate the network and the Adam optimizer used by the training loops.
model = create_model()
optimizer = tf.optimizers.Adam(learning_rate=1e-4)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
siren (Siren)                (None, 1)                 198145    
Total params: 198,145
Trainable params: 198,145
Non-trainable params: 0
_________________________________________________________________


In [None]:
def train_step(inputs, model, optimizer, targets=None):
    """Run one gradient-descent step on the mean squared error.

    Args:
      inputs: Batch fed to ``model``.
      model: Model to optimise; its ``trainable_variables`` are updated.
      optimizer: Optimizer whose ``apply_gradients`` performs the update.
      targets: Values the prediction is compared against. When omitted, the
        inputs themselves are the target, i.e. the model is trained to
        reproduce its own input.

    Returns:
      The loss computed before the update was applied.
    """
    if targets is None:
        targets = inputs

    with tf.GradientTape() as tape:
        prediction = model(inputs)
        loss = tf.keras.losses.MeanSquaredError()(targets, prediction)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

Entrenamos

In [None]:
# Train on the whole `audio` array at every step, logging the loss periodically.
total_steps, steps_til_summary = 1000, 100
for step in range(total_steps):
  loss = train_step(audio, model, optimizer)
  if step % steps_til_summary == 0:
    print("Step %d, Total loss %0.6f" % (step, loss))

Step 0, Total loss 0.257903
Step 100, Total loss 0.001099
Step 200, Total loss 0.000688
Step 300, Total loss 0.000936
Step 400, Total loss 0.000089
Step 500, Total loss 0.000079
Step 600, Total loss 0.003822
Step 700, Total loss 0.000158
Step 800, Total loss 0.000240
Step 900, Total loss 0.000065


Audio Original

In [None]:
# Play the original recording directly from the file on disk.
Audio('ludovico-einaudi.wav')

Audio reconstruido

In [None]:
# Pass the audio through the trained model and drop the size-1 axes for playback.
final_output = tf.squeeze(model(audio))
Audio(final_output.numpy(), rate=22050)

# Interpolación

Sacamos los gradientes de cada canción y los fusionamos de acuerdo a la fórmula de la siguiente celda:

In [None]:
# Blend the two clips in the gradient domain as a convex combination:
#   grads = ALPHA * grad + (1 - ALPHA) * grad_1
# Both weights sum to one, so ALPHA = 1 keeps only the first clip's gradient
# and ALPHA = 0 keeps only the second's.
ALPHA = 0.8
grad = np.gradient(audio, axis = 0)      # finite differences along the sample axis
grad_1 = np.gradient(audio_1, axis = 0)
grads = ALPHA * grad + (1 - ALPHA) * grad_1

In [None]:
# Continue training the same model, now on the blended gradients.
total_steps, steps_til_summary = 1000, 100
for step in range(total_steps):
  loss = train_step(grads, model, optimizer)
  if step % steps_til_summary == 0:
    print("Step %d, Total loss %0.6f" % (step, loss))

Step 0, Total loss 0.000066
Step 100, Total loss 0.000009
Step 200, Total loss 0.000010
Step 300, Total loss 0.000004
Step 400, Total loss 0.000084
Step 500, Total loss 0.000012
Step 600, Total loss 0.000024
Step 700, Total loss 0.000034
Step 800, Total loss 0.000060
Step 900, Total loss 0.000059


Audio interpolado

In [None]:
# Pass the blended gradients through the model and drop the size-1 axes for playback.
final_output = tf.squeeze(model(grads))
Audio(final_output.numpy(), rate=22050)