In [None]:
import os
import librosa
import numpy as np
import soundfile as sf

In [None]:
# Source folder with the MP3 tracks and destination folder for the saved spectrograms.
mp3_dir = 'Music Data/New_mp3/'
mel_dir = 'mel_spectrograms'
os.makedirs(mel_dir, exist_ok=True)

# Only directory entries whose name ends in '.mp3' are converted.
mp3_files = list(filter(lambda name: name.endswith('.mp3'), os.listdir(mp3_dir)))

for mp3_file in mp3_files:
    print(f"Processing {mp3_file}...")
    try:
        # Decode at 22050 Hz and compute a 128-band mel power spectrogram.
        waveform, sample_rate = librosa.load(os.path.join(mp3_dir, mp3_file), sr=22050)
        mel_power = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128)

        # Store the spectrogram in dB relative to its own maximum, as <track name>.npy.
        target_path = os.path.join(mel_dir, os.path.splitext(mp3_file)[0] + '.npy')
        np.save(target_path, librosa.power_to_db(mel_power, ref=np.max))
    except Exception as e:
        # Best effort: report the track that failed and carry on with the next one.
        print(f"Failed on {mp3_file}: {e}")

In [None]:
import matplotlib.pyplot as plt
import librosa.display

# Preview one saved spectrogram as a sanity check of the extraction step.
# Only '.npy' entries are considered, sorted so the same file is picked on every run
# (os.listdir returns entries in arbitrary order and may include other files).
npy_files = sorted(name for name in os.listdir(mel_dir) if name.endswith('.npy'))
mel_file = np.load(os.path.join(mel_dir, npy_files[0]))

plt.figure(figsize=(10, 4))
# sr must match the 22050 Hz used when the spectrograms were computed; a different
# value mislabels both the time axis and the mel-frequency axis.
librosa.display.specshow(mel_file, x_axis='time', y_axis='mel', sr=22050)
plt.colorbar(format='%+2.0f dB')
plt.title('Mel Spectrogram')
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import os

mel_dir = 'mel_spectrograms'
target_frames = 256  # fixed number of time frames per training example

# Load every saved spectrogram and force it to exactly `target_frames` columns.
mel_data = []
for file in sorted(os.listdir(mel_dir)):
    if not file.endswith('.npy'):
        continue
    mel = np.load(os.path.join(mel_dir, file))
    if mel.shape[1] >= target_frames:
        mel = mel[:, :target_frames]
    else:
        # The spectrograms are in dB relative to their own peak, so 0.0 is the LOUDEST
        # possible value. Pad with the clip's quietest value instead, so the padding
        # reads as silence rather than as a full-scale burst.
        pad_width = target_frames - mel.shape[1]
        pad_value = mel.min() if mel.size else 0.0
        mel = np.pad(mel, ((0, 0), (0, pad_width)), mode='constant', constant_values=pad_value)

    mel_data.append(mel)

if not mel_data:
    raise ValueError(f"No .npy spectrograms found in {mel_dir!r}")

mel_data = np.array(mel_data)

# Min-max scale the whole dataset to [0, 1]. A constant dataset has zero range; divide
# by 1 in that case so the result is all zeros instead of NaN.
db_min, db_max = mel_data.min(), mel_data.max()
db_range = db_max - db_min
mel_data = (mel_data - db_min) / (db_range if db_range > 0 else 1.0)

print("Spectrogram data shape:", mel_data.shape)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Turn the normalised array into a float32 tensor and insert a channel axis at
# position 1, giving (B, 1, 128, 256).
mel_tensor = torch.tensor(mel_data, dtype=torch.float32)[:, None]
# Batches of 16 examples, reshuffled every epoch.
train_loader = DataLoader(dataset=TensorDataset(mel_tensor), batch_size=16, shuffle=True)

class VAE(nn.Module):
    """Convolutional variational autoencoder for single-channel spectrogram images.

    The encoder is two stride-2 3x3 convolutions (1 -> 16 -> 32 channels) followed by a
    flatten. Two linear heads map the flattened features to the mean and log-variance
    of a ``latent_dim``-sized Gaussian. The decoder mirrors the encoder with a linear
    layer and two stride-2 transposed convolutions, ending in a sigmoid.

    The linear layers are sized for 32 * 32 * 64 flattened features, i.e. an input of
    shape (B, 1, 128, 256).
    """

    def __init__(self, latent_dim=64):
        super().__init__()
        # Feature count after the encoder: 32 channels on a 32 x 64 grid.
        flat_features = 32 * 32 * 64

        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1),  # (16, 64, 128)
            nn.ReLU(),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1),  # (32, 32, 64)
            nn.ReLU(),
            nn.Flatten(),
        )
        self.fc_mu = nn.Linear(flat_features, latent_dim)
        self.fc_logvar = nn.Linear(flat_features, latent_dim)
        self.fc_decode = nn.Linear(latent_dim, flat_features)
        self.decoder = nn.Sequential(
            nn.Unflatten(1, (32, 32, 64)),
            nn.ConvTranspose2d(in_channels=32, out_channels=16, kernel_size=3,
                               stride=2, padding=1, output_padding=1),  # (16, 64, 128)
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=16, out_channels=1, kernel_size=3,
                               stride=2, padding=1, output_padding=1),  # (1, 128, 256)
            nn.Sigmoid(),
        )

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * exp(0.5 * logvar)`` with ``eps`` drawn from N(0, 1)."""
        std = (0.5 * logvar).exp()
        return mu + torch.randn_like(std) * std

    def forward(self, x):
        """Encode ``x``, sample a latent vector, decode it.

        Returns:
            Tuple ``(x_hat, mu, logvar)``: the reconstruction and the two latent
            distribution parameters produced by the encoder heads.
        """
        features = self.encoder(x)
        mu, logvar = self.fc_mu(features), self.fc_logvar(features)
        latent = self.reparameterize(mu, logvar)
        return self.decoder(self.fc_decode(latent)), mu, logvar

# Model with the default 64-dimensional latent space, optimised with Adam.
model = VAE(latent_dim=64)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Loss function
def vae_loss(x_hat, x, mu, logvar):
    """Per-sample VAE objective: reconstruction error plus KL divergence.

    Both terms are summed over their feature dimensions and averaged over the batch.
    Averaging the reconstruction error over every element while summing the KL term
    would shrink reconstruction by the number of elements per sample, letting the KL
    term dominate and pushing the encoder to ignore its input.

    Args:
        x_hat: Reconstruction produced by the model, same shape as ``x``.
        x: Input batch; its first dimension is the batch size.
        mu: Latent means, one row per sample.
        logvar: Latent log-variances, same shape as ``mu``.

    Returns:
        Scalar tensor ``recon_loss + kl_loss``.
    """
    batch_size = x.size(0)
    recon_loss = nn.functional.mse_loss(x_hat, x, reduction='sum') / batch_size
    # KL(N(mu, sigma^2) || N(0, 1)) in closed form, summed over latent dimensions.
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / batch_size
    return recon_loss + kl_loss

In [None]:
EPOCHS = 10


def _train_step(x):
    """Run one optimiser update on batch ``x`` and return its loss as a Python float."""
    optimizer.zero_grad()
    x_hat, mu, logvar = model(x)
    loss = vae_loss(x_hat, x, mu, logvar)
    loss.backward()
    optimizer.step()
    return loss.item()


model.train()
for epoch in range(EPOCHS):
    # Each batch from the TensorDataset is a one-element sequence holding the inputs.
    # The reported figure is the sum of the batch losses over the epoch.
    total_loss = sum(_train_step(batch[0]) for batch in train_loader)

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

In [None]:
# Decode one random latent vector into a normalised mel spectrogram.
model.eval()
with torch.no_grad():
    # Take the latent size from the decoder's input layer instead of repeating the
    # literal 64, so a model built with another latent_dim still works here.
    latent_dim = model.fc_decode.in_features
    z = torch.randn(1, latent_dim)  # Sample random latent vector
    h_decoded = model.fc_decode(z)
    # Drop the batch and channel axes explicitly, leaving a 2-D spectrogram.
    gen_mel = model.decoder(h_decoded)[0, 0].numpy()

In [None]:
import os

import librosa
import numpy as np
import soundfile as sf

# `mel_data` has already been rescaled to [0, 1], so its own min and max can no longer
# undo that scaling (multiplying by 1 and adding 0 changes nothing). Recover the dB
# range from the saved spectrograms instead, cropped to the training length.
# NOTE(review): zero-padding of short clips should not widen this range as long as the
# files were written with ref=np.max (peak at 0 dB); confirm if mel_dir has other files.
n_frames = mel_data.shape[-1]
db_min, db_max = np.inf, -np.inf
for name in os.listdir(mel_dir):
    if name.endswith('.npy'):
        spec = np.load(os.path.join(mel_dir, name))[:, :n_frames]
        db_min = min(db_min, float(spec.min()))
        db_max = max(db_max, float(spec.max()))

# De-normalize into a new variable so re-running this cell does not rescale twice.
gen_mel_db = gen_mel * (db_max - db_min) + db_min

# Convert back to waveform using Griffin-Lim
audio = librosa.feature.inverse.mel_to_audio(librosa.db_to_power(gen_mel_db), sr=22050)

sf.write('output_lofi.wav', audio, samplerate=22050)

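Whole Code: the complete pipeline (spectrogram extraction, VAE training, audio generation and lo-fi filtering) in a single cell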

In [None]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import soundfile as sf
from pydub import AudioSegment, effects

# -------------------------------
# STEP 1: Convert every MP3 into a dB-scaled mel spectrogram saved as .npy
# -------------------------------
mp3_dir = 'Music Data/New_mp3/'
mel_dir = 'mel_spectrograms'
os.makedirs(mel_dir, exist_ok=True)

# Only directory entries whose name ends in '.mp3' are converted.
mp3_files = list(filter(lambda name: name.endswith('.mp3'), os.listdir(mp3_dir)))
for mp3_file in mp3_files:
    print(f"Processing {mp3_file}...")
    try:
        # Decode at 22050 Hz and compute a 128-band mel power spectrogram.
        waveform, sample_rate = librosa.load(os.path.join(mp3_dir, mp3_file), sr=22050)
        mel_power = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128)

        # Store the spectrogram in dB relative to its own maximum, as <track name>.npy.
        target_path = os.path.join(mel_dir, os.path.splitext(mp3_file)[0] + '.npy')
        np.save(target_path, librosa.power_to_db(mel_power, ref=np.max))
    except Exception as e:
        # Best effort: report the track that failed and carry on with the next one.
        print(f"Failed on {mp3_file}: {e}")

# -------------------------------
# STEP 2: Load mel spectrograms and prepare dataset
# -------------------------------
target_frames = 256  # fixed number of time frames per training example

mel_data = []
for file in sorted(os.listdir(mel_dir)):
    if not file.endswith('.npy'):
        continue
    mel = np.load(os.path.join(mel_dir, file))
    if mel.shape[1] >= target_frames:
        mel = mel[:, :target_frames]
    else:
        # The spectrograms are in dB relative to their own peak, so 0.0 is the LOUDEST
        # possible value. Pad with the clip's quietest value instead, so the padding
        # reads as silence rather than as a full-scale burst.
        pad_width = target_frames - mel.shape[1]
        pad_value = mel.min() if mel.size else 0.0
        mel = np.pad(mel, ((0, 0), (0, pad_width)), mode='constant', constant_values=pad_value)
    mel_data.append(mel)

if not mel_data:
    raise ValueError(f"No .npy spectrograms found in {mel_dir!r}")

mel_data = np.array(mel_data)

# Normalize to [0,1]. A constant dataset has zero range; divide by 1 in that case so
# the result is all zeros instead of NaN.
db_min, db_max = mel_data.min(), mel_data.max()
db_range = db_max - db_min
mel_data = (mel_data - db_min) / (db_range if db_range > 0 else 1.0)

# -------------------------------
# STEP 3: Define VAE model
# -------------------------------
class VAE(nn.Module):
    """Convolutional variational autoencoder for single-channel spectrogram images.

    The encoder is two stride-2 3x3 convolutions (1 -> 16 -> 32 channels) followed by a
    flatten. Two linear heads map the flattened features to the mean and log-variance
    of a ``latent_dim``-sized Gaussian. The decoder mirrors the encoder with a linear
    layer and two stride-2 transposed convolutions, ending in a sigmoid.

    The linear layers are sized for 32 * 32 * 64 flattened features, i.e. an input of
    shape (B, 1, 128, 256).
    """

    def __init__(self, latent_dim=64):
        super().__init__()
        # Feature count after the encoder: 32 channels on a 32 x 64 grid.
        flat_features = 32 * 32 * 64

        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Flatten(),
        )
        self.fc_mu = nn.Linear(flat_features, latent_dim)
        self.fc_logvar = nn.Linear(flat_features, latent_dim)
        self.fc_decode = nn.Linear(latent_dim, flat_features)
        self.decoder = nn.Sequential(
            nn.Unflatten(1, (32, 32, 64)),
            nn.ConvTranspose2d(in_channels=32, out_channels=16, kernel_size=3,
                               stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=16, out_channels=1, kernel_size=3,
                               stride=2, padding=1, output_padding=1),
            nn.Sigmoid(),
        )

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * exp(0.5 * logvar)`` with ``eps`` drawn from N(0, 1)."""
        std = (0.5 * logvar).exp()
        return mu + torch.randn_like(std) * std

    def forward(self, x):
        """Encode ``x``, sample a latent vector, decode it.

        Returns:
            Tuple ``(x_hat, mu, logvar)``: the reconstruction and the two latent
            distribution parameters produced by the encoder heads.
        """
        features = self.encoder(x)
        mu, logvar = self.fc_mu(features), self.fc_logvar(features)
        latent = self.reparameterize(mu, logvar)
        return self.decoder(self.fc_decode(latent)), mu, logvar

# -------------------------------
# STEP 4: Train the VAE
# -------------------------------
# Float32 tensor with a channel axis inserted at position 1, served in shuffled
# batches of 16.
mel_tensor = torch.tensor(mel_data, dtype=torch.float32)[:, None]
train_loader = DataLoader(dataset=TensorDataset(mel_tensor), batch_size=16, shuffle=True)
# Model with the default 64-dimensional latent space, optimised with Adam.
model = VAE(latent_dim=64)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

def vae_loss(x_hat, x, mu, logvar):
    """Per-sample VAE objective: reconstruction error plus KL divergence.

    Both terms are summed over their feature dimensions and averaged over the batch.
    Averaging the reconstruction error over every element while summing the KL term
    would shrink reconstruction by the number of elements per sample, letting the KL
    term dominate and pushing the encoder to ignore its input.

    Args:
        x_hat: Reconstruction produced by the model, same shape as ``x``.
        x: Input batch; its first dimension is the batch size.
        mu: Latent means, one row per sample.
        logvar: Latent log-variances, same shape as ``mu``.

    Returns:
        Scalar tensor ``recon_loss + kl_loss``.
    """
    batch_size = x.size(0)
    recon_loss = nn.functional.mse_loss(x_hat, x, reduction='sum') / batch_size
    # KL(N(mu, sigma^2) || N(0, 1)) in closed form, summed over latent dimensions.
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / batch_size
    return recon_loss + kl_loss

EPOCHS = 12


def _train_step(x):
    """Run one optimiser update on batch ``x`` and return its loss as a Python float."""
    optimizer.zero_grad()
    x_hat, mu, logvar = model(x)
    loss = vae_loss(x_hat, x, mu, logvar)
    loss.backward()
    optimizer.step()
    return loss.item()


model.train()
for epoch in range(EPOCHS):
    # Each batch from the TensorDataset is a one-element sequence holding the inputs.
    # The reported figure is the sum of the batch losses over the epoch.
    total_loss = sum(_train_step(batch[0]) for batch in train_loader)
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# -------------------------------
# STEP 5: Generate multiple chunks and stitch into 2–3 minute audio
# -------------------------------
model.eval()
generated_chunks = []
num_chunks = 30  # Each chunk ≈ 6s → 30 x 6s ≈ 180s = 3 minutes
# Take the latent size from the decoder's input layer instead of repeating the literal
# 64, so a model built with another latent_dim still works here.
latent_dim = model.fc_decode.in_features

with torch.no_grad():
    for chunk_idx in range(num_chunks):
        z = torch.randn(1, latent_dim)
        h_decoded = model.fc_decode(z)
        # Drop the batch and channel axes explicitly, leaving a 2-D spectrogram.
        chunk = model.decoder(h_decoded)[0, 0].numpy()
        generated_chunks.append(chunk)

# Join the chunks along the time axis; values are still in the normalised [0, 1] range.
gen_mel_norm = np.concatenate(generated_chunks, axis=1)

# `mel_data` has already been rescaled to [0, 1], so its own min and max can no longer
# undo that scaling (multiplying by 1 and adding 0 changes nothing). Recover the dB
# range from the saved spectrograms instead, cropped to the training length.
# NOTE(review): zero-padding of short clips should not widen this range as long as the
# files were written with ref=np.max (peak at 0 dB); confirm if mel_dir has other files.
n_frames = mel_data.shape[-1]
db_min, db_max = np.inf, -np.inf
for name in os.listdir(mel_dir):
    if name.endswith('.npy'):
        spec = np.load(os.path.join(mel_dir, name))[:, :n_frames]
        db_min = min(db_min, float(spec.min()))
        db_max = max(db_max, float(spec.max()))

gen_mel = gen_mel_norm * (db_max - db_min) + db_min  # De-normalize back to dB

# -------------------------------
# STEP 6: Invert the dB mel spectrogram to a waveform and save it
# -------------------------------
mel_power = librosa.db_to_power(gen_mel)
audio = librosa.feature.inverse.mel_to_audio(mel_power, sr=22050)
sf.write('output_lofi.wav', audio, samplerate=22050)

# -------------------------------
# STEP 7: Lo-fi effect: re-read the WAV and low-pass it at 3500 Hz
# -------------------------------
sound = effects.low_pass_filter(AudioSegment.from_wav('output_lofi.wav'), cutoff=3500)
sound.export('output_lofi_lofiStyle.wav', format='wav')

print("✅ Generated lo-fi audio: output_lofi_lofiStyle.wav")