# Stem Separation using ICA

In [None]:
# Uploading a song into the Colab workspace.
# The value returned by files.upload() is kept in `song`.

from google.colab import files

song = files.upload()

Saving Sililara_Sitha_Nayana_-_Soorya_Nagare_Athula_Adikari_Sarigama_lk.mp3 to Sililara_Sitha_Nayana_-_Soorya_Nagare_Athula_Adikari_Sarigama_lk.mp3


In [None]:
!pip install pydub librosa soundfile

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
# Converting the MP3 into wav format
from pydub import AudioSegment

# Load the MP3 file
audio_path = '/content/Sililara_Sitha_Nayana_-_Soorya_Nagare_Athula_Adikari_Sarigama_lk.mp3'
audio = AudioSegment.from_mp3(audio_path)

# Export as WAV.
# export() hands back the output file object still open; close it explicitly so the
# data is flushed to disk before later cells read 'audio.wav', the handle is not
# leaked, and the file-object repr does not show up as the cell output.
audio.export('audio.wav', format='wav').close()


<_io.BufferedRandom name='audio.wav'>

In [None]:
import numpy as np
import librosa
import soundfile as sf
from sklearn.decomposition import FastICA

# Load the WAV file, keeping the channels separate.
# With mono=False a multi-channel file comes back as (n_channels, n_samples),
# while a single-channel file comes back as a 1-D array (n_samples,).
data, samplerate = librosa.load('audio.wav', sr=None, mono=False)

# Number of independent sources to estimate
n_sources = 2

# Normalise the layout to (n_channels, n_samples) so the transpose below always
# yields (n_samples, n_channels), i.e. one row per observation for FastICA.
if data.ndim == 1:
    data = data[np.newaxis, :]

# ICA cannot recover more sources than there are observed channels.
if data.shape[0] < n_sources:
    raise ValueError(
        f'ICA needs at least {n_sources} channels to estimate {n_sources} sources, '
        f'but audio.wav has {data.shape[0]}'
    )

# Perform ICA
ica = FastICA(n_components=n_sources, random_state=0)
sources = ica.fit_transform(data.T)  # shape (n_samples, n_sources)

# ICA recovers each source only up to an arbitrary scale and sign, so the raw
# amplitudes are not suitable for writing directly. Peak-normalise a copy of each
# source to [-1, 1] before saving; `sources` itself is left untouched.
peaks = np.max(np.abs(sources), axis=0)
peaks[peaks == 0] = 1.0  # avoid dividing an all-zero (silent) source by zero
sources_normalized = sources / peaks

# Save the separated sources as audio files
sf.write('source1.wav', sources_normalized[:, 0], samplerate)
sf.write('source2.wav', sources_normalized[:, 1], samplerate)


In [None]:
import librosa
# specshow lives in the librosa.display submodule; import it explicitly because it
# is not guaranteed to be loaded by `import librosa` alone.
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

# Load the WAV file (librosa.load mixes down to mono by default)
y, sr = librosa.load('audio.wav', sr=None)

# Compute the STFT and keep only its magnitude
S = np.abs(librosa.stft(y))

# Display the magnitude spectrogram in dB relative to its peak
plt.figure(figsize=(10, 6))
librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), sr=sr, x_axis='time', y_axis='log')
plt.colorbar(format='%+2.0f dB')
plt.title('Magnitude Spectrogram')
plt.show()


In [None]:
from sklearn.decomposition import NMF

# Number of components (stems) to extract
n_components = 2

# Apply NMF to the magnitude spectrogram so that S is approximated by W @ H
model = NMF(n_components=n_components, init='random', random_state=0)
W = model.fit_transform(S)  # Basis components, one column per component
H = model.components_  # Activations, one row per component

# NMF only models magnitudes. Inverting a magnitude-only spectrogram throws away the
# phase and produces badly distorted audio, so each component is turned into a soft
# mask and applied to the complex STFT of the mixture instead.
# `y` is expected to still be the waveform that `S` was computed from.
stft_mix = librosa.stft(y)
if stft_mix.shape != S.shape:
    raise ValueError(
        'The STFT of `y` does not match `S`; re-run the cell that loads audio.wav '
        'and computes S before running this cell.'
    )

# Full NMF approximation; the small constant keeps the mask division well defined
approximation = W @ H + 1e-10

# Reconstruct the separated signals
reconstructed_signals = []
for i in range(n_components):
    S_i = np.outer(W[:, i], H[i, :])
    mask_i = S_i / approximation  # share of the modelled energy owned by component i
    # length=len(y) trims/pads the output so every stem lines up with the input audio
    y_i = librosa.istft(mask_i * stft_mix, length=len(y))
    reconstructed_signals.append(y_i)


In [None]:
import soundfile as sf
import IPython.display as ipd

# Write every separated stem to its own WAV file and report where it went
for i, stem_audio in enumerate(reconstructed_signals):
    stem_path = f'stem_{i+1}.wav'
    sf.write(stem_path, stem_audio, sr)
    print(f'Stem {i+1} saved as {stem_path}')

# Render an inline audio player for the first stem
first_stem = reconstructed_signals[0]
ipd.Audio(first_stem, rate=sr)


In [None]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

# Read the audio at its native sample rate
y, sr = librosa.load('audio.wav', sr=None)

# Magnitude of the Short-Time Fourier Transform (STFT)
S = np.abs(librosa.stft(y))

# Convert magnitudes to decibels relative to the loudest bin
S_db = librosa.amplitude_to_db(S, ref=np.max)

# Draw the spectrogram on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
spec_img = librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='log', ax=ax)
fig.colorbar(spec_img, ax=ax, format='%+2.0f dB')
ax.set_title('Log-Scaled Spectrogram')
plt.show()


In [None]:
# Create input features (spectrogram patches) and labels (isolated source spectrograms)
# This example assumes you have paired data (mixture and source)
def create_dataset(mixture, source, patch_size):
    """Cut two paired 2-D arrays into aligned, non-overlapping patches.

    Both arrays are sliced along their second axis (the frame axis of a
    spectrogram) at the same offsets, so ``X[k]`` and ``y[k]`` cover the same
    columns. Trailing columns that do not fill a whole patch are discarded.

    Args:
        mixture: 2-D array the input patches are taken from; its second
            dimension determines how many patches are produced.
        source: 2-D array the target patches are taken from, sliced at the
            same column offsets as ``mixture``.
        patch_size: Number of columns per patch; must be a positive integer.

    Returns:
        A tuple ``(X, y)`` of arrays stacking the mixture patches and the
        source patches. Both are empty when ``mixture`` has fewer than
        ``patch_size`` columns.

    Raises:
        ValueError: If ``patch_size`` is smaller than 1.
    """
    if patch_size < 1:
        raise ValueError(f'patch_size must be a positive integer, got {patch_size}')

    n_frames = mixture.shape[1]
    # The "+ 1" keeps the final patch when n_frames is an exact multiple of
    # patch_size; without it the last full patch was silently dropped.
    starts = range(0, n_frames - patch_size + 1, patch_size)

    X = [mixture[:, start:start + patch_size] for start in starts]
    y = [source[:, start:start + patch_size] for start in starts]
    return np.array(X), np.array(y)

# Patch width in spectrogram frames; tune as needed
patch_size = 128
# Demo only: the log-spectrogram stands in for both the mixture and the target source
X, y = create_dataset(mixture=S_db, source=S_db, patch_size=patch_size)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Define the CNN model
def build_cnn(input_shape):
    """Build a CNN that maps an input patch to an output of the same shape.

    The network stacks four 3x3 convolutions (32, 64, 128 and 256 filters) with
    2x2 max-pooling after the first three, flattens the result, and uses two
    dense layers to regress every value of the output patch.

    Args:
        input_shape: Shape of a single input example, passed to the model's
            input layer and reused as the shape of the output.

    Returns:
        An uncompiled ``Sequential`` model whose output is reshaped to
        ``input_shape``.
    """
    # np.prod returns a NumPy scalar; hand Dense a built-in int for `units`.
    n_outputs = int(np.prod(input_shape))

    return models.Sequential([
        # Declare the input with an explicit Input layer instead of passing the
        # deprecated `input_shape=` keyword to the first Conv2D.
        layers.Input(shape=input_shape),
        layers.Conv2D(32, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(256, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(1024, activation='relu'),
        layers.Dense(n_outputs, activation='linear'),  # Output the spectrogram
        layers.Reshape(input_shape),  # Reshape the output to the original spectrogram shape
    ])

# Derive the per-example input shape from the patch array and append a channel axis,
# e.g. (128, 128, 1) for mono spectrogram patches
patch_height, patch_width = X.shape[1], X.shape[2]
input_shape = (patch_height, patch_width, 1)

# Build the network for that shape
model = build_cnn(input_shape)

# Train with Adam against a mean-squared-error objective
model.compile(loss='mean_squared_error', optimizer='adam')

# Print the layer-by-layer summary
model.summary()
