In [None]:
import os
import pprint

import librosa
import matplotlib.pyplot as plt
import numpy as np
import soundfile
import tensorflow as tf
from keras.models import Model
from matplotlib.pyplot import imshow
from PIL import Image
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv1D, Input
from tensorflow.python.framework.ops import EagerTensor

%matplotlib inline

In [None]:
# Make every @tf.function in this notebook execute eagerly (plain Python,
# op by op) instead of being traced into a graph.
tf.config.run_functions_eagerly(True)

In [None]:
# Colab only: mount Google Drive so the /content/drive/MyDrive/... paths used
# by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): img_size is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
img_size = 512

In [None]:
def wav_to_mel(file_path, n_fft=2048, hop_length=512, n_mels=128):
  """Convert an audio file into a mel spectrogram stored as a float TIFF.

  The audio is loaded with ``librosa.load`` (default settings), passed through
  ``librosa.effects.trim``, converted to a mel spectrogram and saved next to
  the input file with its extension replaced by ``.tiff``.

  Args:
    file_path: Path of the audio file to read.
    n_fft: FFT window size forwarded to librosa.
    hop_length: Hop length forwarded to librosa.
    n_mels: Number of mel bands forwarded to librosa.

  Returns:
    Tuple ``(output_path, sr)``: the path of the TIFF that was written and the
    sampling rate returned by ``librosa.load``.
  """
  y, sr = librosa.load(file_path)
  audio, _ = librosa.effects.trim(y)
  # The audio must be passed as `y=`: recent librosa releases made the
  # signal argument keyword-only, so the positional form raises a TypeError.
  S = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft,
                                     hop_length=hop_length, n_mels=n_mels)
  im = Image.fromarray(S).convert('F')
  # splitext strips only the final extension, so a dot in a directory name
  # (or a leading "./") no longer truncates the output path.
  output_path = os.path.splitext(file_path)[0] + '.tiff'
  im.save(output_path)
  return output_path, sr

In [None]:
def mel_to_wav(file_path, sr, n_fft=2048, hop_length=512):
  """Invert a mel-spectrogram image back to audio and write it as a WAV file.

  The image is read into a NumPy array, inverted with
  ``librosa.feature.inverse.mel_to_audio`` and written next to the input file
  as ``<name>_ext.wav``.

  Args:
    file_path: Path of the spectrogram image to read.
    sr: Sampling rate used for the inversion and for the written WAV file.
    n_fft: FFT window size forwarded to librosa.
    hop_length: Hop length forwarded to librosa.

  Returns:
    Path of the WAV file that was written.
  """
  # Copy the pixels while the file is open; the context manager then releases
  # the handle (TIFF files are not closed automatically after loading).
  with Image.open(file_path) as im:
    mel = np.array(im)
  wav = librosa.feature.inverse.mel_to_audio(mel, sr=sr, n_fft=n_fft, hop_length=hop_length)
  # splitext strips only the final extension, so dots elsewhere in the path
  # do not truncate the output name.
  output_path = os.path.splitext(file_path)[0] + '_ext.wav'
  soundfile.write(output_path, wav, samplerate=sr)
  return output_path

In [None]:
# Convert the content clip and the style clip to mel-spectrogram TIFFs,
# keeping the path of each image and the sampling rate of each clip.
content_mel__path, content_sampeling_rate = wav_to_mel("/content/drive/MyDrive/ML-Project/input_wav/classical00000.wav")
style_mel_path, style_sampeling_rate = wav_to_mel("/content/drive/MyDrive/ML-Project/input_wav/jazz00000.wav")

In [None]:
# Load the content spectrogram, append a channel axis and prepend a batch axis.
# The context manager closes the TIFF handle once the pixels are copied
# (TIFF files are otherwise left open after loading).
with Image.open(content_mel__path) as content_tiff:
    content_image = np.array(content_tiff)
content_image = np.expand_dims(content_image, axis=-1)
input_shape = content_image.shape  # per-example shape, used later to build the model input
print(content_image.shape)
content_image = tf.constant(content_image[np.newaxis, ...])
print(content_image.shape)

In [None]:
# Load the style spectrogram, append a channel axis and prepend a batch axis.
# The context manager closes the TIFF handle once the pixels are copied
# (TIFF files are otherwise left open after loading).
with Image.open(style_mel_path) as style_tiff:
    style_image = np.array(style_tiff)
style_image = np.expand_dims(style_image, axis=-1)
print(style_image.shape)
style_image = tf.constant(style_image[np.newaxis, ...])
print(style_image.shape)

In [None]:
# Feature extractor: two stacked Conv1D layers applied to the spectrogram.
# No training happens in this notebook, so the weights stay at their initial
# values and the model is not compiled (the previous compile() call with a
# classification loss had no effect on the forward passes used here).
#
# The layers are named explicitly: Keras' auto-generated names ('conv1d',
# 'conv1d_1', then 'conv1d_2', ... on every re-run of this cell) would otherwise
# break the later lookups by 'conv1d' / 'conv1d_1'.
#
# The input shape is declared once by the Input layer; the former
# `input_shape=input_shape[1:]` kwargs on the Conv1D layers contradicted it
# and have been dropped.
#
# NOTE(review): the Input layer fixes the spectrogram size to the content
# image's shape — confirm the style spectrogram has the same number of frames.
conv_model = Sequential()
conv_model.add(Input(shape=input_shape))
conv_model.add(Conv1D(256, 4, activation='relu', name='conv1d'))
conv_model.add(Conv1D(128, 4, activation='relu', name='conv1d_1'))

conv_model.summary()

In [None]:
def compute_content_cost(content_output, generated_output):
    """Content cost between the last activations of two model outputs.

    Args:
        content_output: Sequence of activations for the content image; only
            the last element is used.
        generated_output: Sequence of activations for the generated image;
            only the last element is used. Its 4-D static shape supplies the
            normalisation constants.

    Returns:
        Sum of squared differences between the two activations, scaled by
        1 / (4 * n_H * n_W * n_C).
    """
    content_act = content_output[-1]
    generated_act = generated_output[-1]

    _, height, width, channels = generated_act.get_shape().as_list()

    def _unroll(activation):
        # Move channels in front of the spatial axes, then flatten to (n_C, -1).
        channels_first = tf.transpose(activation, perm=[0, 3, 1, 2])
        return tf.reshape(channels_first, shape=[channels, -1])

    difference = tf.subtract(_unroll(content_act), _unroll(generated_act))
    scale = 1 / (4 * height * width * channels)

    return scale * tf.reduce_sum(tf.square(difference))

In [None]:
def gram_matrix(A):
    """Return the Gram matrix of A, i.e. A multiplied by its own transpose."""
    A_transposed = tf.transpose(A)
    return tf.linalg.matmul(A, A_transposed)

In [None]:
def compute_layer_style_cost(a_S, a_G):
    """Style cost for a single layer, based on Gram-matrix differences.

    Args:
        a_S: Activations of the layer for the style image.
        a_G: Activations of the layer for the generated image; its 4-D static
            shape supplies n_H, n_W and n_C for the normalisation.

    Returns:
        Sum of squared differences between the two Gram matrices, scaled by
        1 / (4 * n_C**2 * (n_H * n_W)**2).
    """
    _, height, width, channels = a_G.get_shape().as_list()

    def _gram_of(activation):
        # Channels-first, flattened to (n_C, -1), then correlated with itself.
        channels_first = tf.transpose(activation, perm=[0, 3, 1, 2])
        return gram_matrix(tf.reshape(channels_first, shape=[channels, -1]))

    style_gram = _gram_of(a_S)
    generated_gram = _gram_of(a_G)

    normaliser = 1. / (4 * (channels**2) * (height * width)**2)
    gram_error = tf.square(tf.subtract(style_gram, generated_gram))

    return normaliser * tf.reduce_sum(gram_error)

In [None]:
# Print the name of every layer in conv_model; later cells look layers up by
# these names ('conv1d', 'conv1d_1').
for layer in conv_model.layers:
    print(layer.name)

In [None]:
# (layer name, weight) pairs consumed by compute_style_cost; the weight scales
# that layer's contribution to the total style cost.
STYLE_LAYERS = [('conv1d', 0.5), ('conv1d_1', 0.5)]

In [None]:
def compute_style_cost(style_image_output, generated_image_output, STYLE_LAYERS=STYLE_LAYERS):
    """Weighted sum of the per-layer style costs over the configured style layers.

    The first ``len(STYLE_LAYERS)`` entries of each output list are treated as
    the style-layer activations, in the same order as ``STYLE_LAYERS``. Any
    further trailing entry (e.g. a separate content-layer output) is ignored.
    Slicing by the number of configured layers, rather than always dropping
    the last output, means no configured style layer is silently skipped when
    the model emits exactly one output per style layer.

    Args:
        style_image_output: Sequence of activations for the style image.
        generated_image_output: Sequence of activations for the generated image.
        STYLE_LAYERS: Sequence of ``(layer_name, weight)`` pairs.

    Returns:
        The sum over style layers of ``weight * compute_layer_style_cost(...)``.

    Raises:
        ValueError: If either output list has fewer entries than STYLE_LAYERS.
    """
    n_style = len(STYLE_LAYERS)
    if len(style_image_output) < n_style or len(generated_image_output) < n_style:
        raise ValueError(
            f"Expected at least {n_style} layer outputs, got "
            f"{len(style_image_output)} (style) and {len(generated_image_output)} (generated)"
        )

    a_S = style_image_output[:n_style]
    a_G = generated_image_output[:n_style]

    J_style = 0
    for layer_a_S, layer_a_G, (_, weight) in zip(a_S, a_G, STYLE_LAYERS):
        # Add this layer's weighted contribution to the overall style cost.
        J_style += weight * compute_layer_style_cost(layer_a_S, layer_a_G)

    return J_style

In [None]:
@tf.function()
def total_cost(J_content, J_style, alpha = 10, beta = 40):
    """Combine the content and style costs into the total cost.

    Args:
        J_content: Content cost.
        J_style: Style cost.
        alpha: Multiplier applied to the content cost.
        beta: Multiplier applied to the style cost.

    Returns:
        alpha * J_content + beta * J_style.
    """
    weighted_content = alpha * J_content
    weighted_style = beta * J_style
    return weighted_content + weighted_style

In [None]:
# Initial generated image: the content spectrogram plus uniform noise drawn
# from [-0.25, 0.25), clamped to [0, 1]. No random seed is set, so the noise
# differs between runs.
# The result of add/clip is a plain tensor, not a tf.Variable; the training
# cell wraps it in a tf.Variable again before optimising.
generated_image = tf.Variable(tf.image.convert_image_dtype(content_image, tf.float32))
noise = tf.random.uniform(tf.shape(generated_image), -0.25, 0.25)
generated_image = tf.clip_by_value(
    tf.add(generated_image, noise),
    clip_value_min=0.0,
    clip_value_max=1.0,
)

print(generated_image.shape)

In [None]:
# Multi-output view of conv_model that returns the activations of the
# 'conv1d' and 'conv1d_1' layers, in that order.
# tf.keras.Model is used instead of keras.models.Model so that the wrapper
# comes from the same Keras implementation as the tensorflow.keras
# Sequential/Conv1D objects it wraps; the two namespaces are not guaranteed to
# resolve to the same package.
model = tf.keras.Model(
    inputs=conv_model.input,
    outputs=[conv_model.get_layer(layer_name).output for layer_name in ['conv1d', 'conv1d_1']],
)

In [None]:
# Activations of the selected layers for the content image.
preprocessed_content = tf.Variable(
    tf.image.convert_image_dtype(content_image, dtype=tf.float32)
)
a_C = model(preprocessed_content)

In [None]:
# Activations of the selected layers for the style image.
preprocessed_style = tf.Variable(
    tf.image.convert_image_dtype(style_image, dtype=tf.float32)
)
a_S = model(preprocessed_style)

In [None]:
# Display the content tensor as the cell output (sanity check of shape, dtype and values).
preprocessed_content

In [None]:
# Display the style tensor as the cell output (sanity check of shape, dtype and values).
preprocessed_style

In [None]:
# Adam optimizer used by train_step to update the generated image.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

@tf.function()
def train_step(generated_image):
    """Run one optimisation step on the generated image.

    Computes the total cost from the notebook-level `model`, `a_S` and `a_C`,
    applies its gradient to `generated_image` with the notebook-level
    `optimizer`, then clamps the image back into [0, 1].

    Args:
        generated_image: tf.Variable holding the image being optimised; it is
            updated in place.

    Returns:
        The total cost computed before the update was applied.
    """
    with tf.GradientTape() as tape:
        generated_outputs = model(generated_image)

        style_cost = compute_style_cost(a_S, generated_outputs)
        content_cost = compute_content_cost(a_C, generated_outputs)

        J = total_cost(content_cost, style_cost, alpha=10, beta=40)

    gradient = tape.gradient(J, generated_image)
    optimizer.apply_gradients([(gradient, generated_image)])

    # Keep the optimised values inside the [0, 1] range.
    clamped = tf.clip_by_value(generated_image, clip_value_min=0.0, clip_value_max=1.0)
    generated_image.assign(clamped)
    return J

In [None]:
def tensor_to_image(tensor):
  """Convert a tensor into a mode-'F' PIL image.

  Values are multiplied by 255 and cast to uint8 before the image is built.

  Args:
    tensor: Array-like image data. Inputs with more than three dimensions
      must have a leading dimension of size 1; that dimension is dropped and
      channel 0 is kept. A 3-D input with a single trailing channel is
      reduced to 2-D as well (previously it failed inside Image.fromarray).

  Returns:
    A PIL image converted to mode 'F'.
  """
  pixels = np.array(tensor * 255, dtype=np.uint8)
  if np.ndim(pixels) > 3:
    assert pixels.shape[0] == 1
    pixels = pixels[0]
    pixels = pixels[:, :, 0]
  elif np.ndim(pixels) == 3 and pixels.shape[-1] == 1:
    # Image.fromarray cannot build an image from an (H, W, 1) array.
    pixels = pixels[:, :, 0]
  return Image.fromarray(pixels).convert('F')

In [None]:
# Optimise the generated image in place, saving a checkpoint every 250 steps.
generated_image = tf.Variable(generated_image)
print(generated_image.shape)

# Create the checkpoint directory up front so the first save cannot fail
# because the folder is missing.
output_dir = "/content/drive/MyDrive/ML-Project/output_image"
os.makedirs(output_dir, exist_ok=True)

epochs = 10000
for i in range(epochs):
  cost = train_step(generated_image)
  if i % 250 == 0:
    # Log only at checkpoints (one line per step floods the output) and
    # include the cost returned by train_step so progress is visible.
    print(f"Epoch no {i}, cost = {float(cost):.6f}")
    image = tensor_to_image(generated_image)
    imshow(image)
    image.save(f"{output_dir}/image_{i}.tiff")
    plt.show()

In [None]:
# NOTE(review): generated_file_path is assigned but not used by any visible cell.
generated_file_path = ""
# Convert a saved checkpoint back to audio with the content clip's sampling
# rate. image_9750.tiff is the last checkpoint the training loop writes when
# epochs = 10000 and a checkpoint is saved every 250 steps.
mel_to_wav("/content/drive/MyDrive/ML-Project/output_image/image_9750.tiff", content_sampeling_rate)