In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import subprocess
import numpy as np
import pandas as pd
import librosa
import psutil
import pickle
import gc  # Import gc for garbage collection
import tensorflow as tf
import keras
from tensorflow.keras.layers import Conv2D, BatchNormalization, Activation, MaxPooling2D, UpSampling2D, Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

# Kaggle dataset folder holding the input recording(s), and the working
# folder that receives any tracks extracted from them.
input_dir = '/kaggle/input/song-test'
output_dir = '/kaggle/working/extracted_tracks'

# Make sure the working folder is present before anything writes into it.
if not os.path.isdir(output_dir):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
def extract_audio_tracks(mp4_file, output_dir, num_tracks=5):
    """Demux the audio streams of a media file into separate MP3 files.

    Runs ``ffmpeg`` once per audio stream index ``i``, mapping stream
    ``0:a:<i>`` to ``<output_dir>/<basename of mp4_file>_track<i>.mp3``.

    Args:
        mp4_file: Path to the input media file.
        output_dir: Directory the extracted files are written into.
        num_tracks: Number of audio stream indices to try, starting at 0.
            Defaults to 5, the count the original code assumed.

    Returns:
        List of output paths, in stream order, for the streams ffmpeg
        extracted successfully. Indices for which ffmpeg exits with a
        non-zero status (e.g. the input has fewer audio streams) are skipped.
    """
    audio_tracks = []
    base_name = os.path.basename(mp4_file)
    for i in range(num_tracks):
        output_file = os.path.join(output_dir, f'{base_name}_track{i}.mp3')
        # -y overwrites output left by a previous run and -nostdin disables
        # interactive prompts, so re-running the cell cannot block on
        # ffmpeg's "overwrite?" question.
        result = subprocess.run(
            ['ffmpeg', '-nostdin', '-y', '-i', mp4_file, '-map', f'0:a:{i}', output_file]
        )
        if result.returncode != 0:
            # Best effort: report and skip instead of returning a path to a
            # file that was never produced.
            print(f'ffmpeg failed for audio stream {i} of {mp4_file} '
                  f'(exit code {result.returncode}); skipping.')
            continue
        audio_tracks.append(output_file)
    return audio_tracks

In [None]:
def process_audio_track(audio_file, sr=22050, duration=1):
    """Load an audio file and cut it into equal-length blocks.

    Args:
        audio_file: Path handed to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load``.
        duration: Length of each block in seconds; may be fractional.

    Returns:
        List of 1-D sample arrays in file order. Every block holds
        ``round(sample_rate * duration)`` samples; a short final block is
        zero-padded at the end to that length.

    Raises:
        ValueError: If the block length works out to less than one sample.
    """
    # Use the rate librosa reports back rather than the requested one, so the
    # block length stays correct when the caller passes sr=None.
    y, loaded_sr = librosa.load(audio_file, sr=sr)

    # range() needs an int step; a fractional duration would otherwise make
    # the product a float and raise TypeError.
    block_length = int(round(loaded_sr * duration))
    if block_length < 1:
        raise ValueError(
            f'sr * duration must cover at least one sample, got {loaded_sr} * {duration}'
        )

    blocks = []
    for start in range(0, len(y), block_length):
        block = y[start:start + block_length]
        shortfall = block_length - len(block)
        if shortfall:
            # Only the last block can be short; pad it with trailing zeros.
            block = np.pad(block, (0, shortfall), mode='constant')
        blocks.append(block)

    return blocks

In [None]:
def perform_stft(y, n_fft=500, hop_length=125):
    """Return the short-time Fourier transform of ``y`` via ``librosa.stft``.

    The small default ``n_fft`` is deliberate: the author noted that
    n_fft=2048 with hop_length=125 was too big for computation.
    """
    stft_settings = {'n_fft': n_fft, 'hop_length': hop_length}
    return librosa.stft(y, **stft_settings)

In [None]:
def prepare_complex_data(data):
    """Split complex values into real and imaginary parts, joined on the last axis.

    The returned array's last axis is twice as long as that of ``data``: the
    first half holds the real parts, the second half the imaginary parts.
    """
    components = [extract(data) for extract in (np.real, np.imag)]
    return np.concatenate(components, axis=-1)

In [None]:
# extract_audio_tracks('/kaggle/input/song-test/subway-mirage-261477.mp3',output_dir)

In [None]:
# Cut the test song into one-second blocks.
blocks = process_audio_track('/kaggle/input/song-test/subway-mirage-261477.mp3', duration=1)

# One complex STFT matrix per block, kept in block order.
data = [perform_stft(block, n_fft=500, hop_length=125) for block in blocks]

In [None]:
import numpy as np


# Step 1: give every per-block STFT matrix a trailing axis of length 1
# (the shape printed later in this notebook implies 251 x 177 per matrix).
expanded_data = [stft_block[..., np.newaxis] for stft_block in data]

# Step 2: pile the blocks up along a new leading axis.
final_array = np.stack(expanded_data)

In [None]:
# Turn the complex STFT stack into a real-valued array: real and imaginary
# parts are joined along the last axis.
X = prepare_complex_data(final_array)

In [None]:
# Display the shape of the model input X as a sanity check before predicting.
X.shape

In [None]:
# Load the trained Keras model for inference only. compile=False skips
# restoring the optimizer/loss/metrics: predict() does not use them, and
# loading no longer fails if the saved training configuration cannot be
# resolved in this environment.
model = tf.keras.models.load_model(
    '/kaggle/input/trained-model/model_batch_1 (1).h5',
    compile=False,
)

In [None]:
# Run the loaded model over every block in X.
predictions = model.predict(X)

In [None]:
import numpy as np

# Pull the two components back out of the channel axis: even-indexed channels
# are read as real parts, odd-indexed channels as imaginary parts.
# NOTE(review): this agrees with prepare_complex_data's [real..., imag...]
# layout only when there is a single complex channel — confirm against the
# model's output shape.
real_parts = predictions[..., ::2]
imaginary_parts = predictions[..., 1::2]

# Rebuild the complex-valued array from its two components.
predicted_array = 1j * imaginary_parts + real_parts

In [None]:
# Drop the trailing channel axis so each entry is a 2-D STFT matrix.
# Guarded on ndim so that re-running this cell does not strip a second axis
# (the unguarded `x = x[..., 0]` form is not idempotent).
if predicted_array.ndim == 4:
    predicted_array = predicted_array[..., 0]
print(predicted_array.shape)  # The author's run reported: (131, 251, 177)


In [None]:
import numpy as np
import librosa
import soundfile as sf

# Sample rate the output WAV is written at.
# NOTE(review): hard-coded as the source audio's native rate — confirm before
# using a different input file.
original_sr = 48000

# Settings the STFT blocks were produced with. Keep these in sync with
# process_audio_track (sr, duration) and perform_stft (hop_length).
analysis_sr = 22050
block_samples = analysis_sr * 1  # one-second blocks
stft_hop_length = 125


# Define ISTFT function
def istft(stft_matrix, hop_length=125, length=None):
    """
    Convert STFT matrix back to a time-domain signal.

    Args:
        stft_matrix: Complex STFT matrix of a single block.
        hop_length: Hop length the STFT was computed with.
        length: If given, the output is zero-padded or clipped to exactly
            this many samples. ``None`` keeps librosa's default length.

    Returns:
        The reconstructed 1-D time-domain signal.
    """
    return librosa.istft(stft_matrix, hop_length=hop_length, length=length)


# Invert every predicted block (iterating the array itself instead of a
# hard-coded count). Passing length= restores each block to its full duration:
# without it a 177-frame block comes back as 176 * 125 = 22000 samples,
# 50 short of the 22050 that went in, so the output drifts against the input.
reconstructed_signals = [
    istft(block_stft, hop_length=stft_hop_length, length=block_samples)
    for block_stft in predicted_array
]

# Concatenate all reconstructed blocks at the analysis rate.
signal_at_analysis_sr = np.concatenate(reconstructed_signals, axis=0)

# Resample once over the whole signal instead of block by block, which avoids
# resampling-filter edge effects at every block boundary.
final_signal = librosa.resample(
    signal_at_analysis_sr, orig_sr=analysis_sr, target_sr=original_sr
).astype(np.float32)

# Save the concatenated signal as a WAV file
sf.write('reconstructed_audio.wav', final_signal, original_sr, subtype='PCM_16')

print("Reconstructed audio saved as 'reconstructed_audio.wav'.")


In [None]:
from IPython.display import Audio

# Location of the WAV file written by the reconstruction step.
audio_file_path = 'reconstructed_audio.wav'

# Build the inline player and leave it as the cell's last expression so the
# notebook renders it.
player = Audio(audio_file_path)
player
