<a href="https://colab.research.google.com/github/Abhishek3102/Variational-AutoEncoders/blob/main/Change_Singers_Voice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install librosa torch torchvision matplotlib numpy



In [2]:
# List the contents of the current working directory (sanity check before extracting the data).
!ls

Atif_Aslam  Honey_Singh  preprocessed_Atif_Aslam  preprocessed_Honey_Singh  sample_data  ss  ss.zip


In [3]:
!unzip ss.zip

Archive:  ss.zip
   creating: Atif_Aslam/
  inflating: Atif_Aslam/Dil Diyan Gallan Tiger Zinda Hai 128 Kbps.mp3  
  inflating: Atif_Aslam/Jeene Laga Hoon Bollywood Sing Along - Ramaiya Vastavaiya - Girish Kumar, Sh.mp3  
  inflating: Atif_Aslam/MAIN_RANG_SARBOTO_KA...FULL_SONG(128k).mp3  
  inflating: Atif_Aslam/Pehli Nazar Main-(Mr-Jatt.com).mp3  
  inflating: Atif_Aslam/Piya O Re Piya-(Mr-Jatt.com).mp3  
  inflating: Atif_Aslam/Tere Sang Yaara-(Mr-Jatt.com).mp3  
  inflating: Atif_Aslam/Tere_Liye_Lyrical_-_Prince__Vivek_Oberoi_&_Aruna_Sheilds__Atif_Aslam,_Shreya_Ghoshal(256k).mp3  
  inflating: Atif_Aslam/Tu_Jaane_Na_Lyrical_Video-_Ajab_Prem_Ki_Ghazab_Kahani__Atif_Aslam__Ranbir_Kapoor,_Katrina_Kaif(128k).mp3  
   creating: Honey_Singh/
  inflating: Honey_Singh/024 Brown Rang (INTERNATIONAL VILLAGER).mp3  
  inflating: Honey_Singh/Alcoholic Full Video _ The Shaukeens _ Yo Yo Honey Singh _ Akshay Kumar & Lisa Haydon.mp3  
  inflating: Honey_Singh/Blue Eyes Yo Yo Honey Singh 128 Kbps.mp

In [4]:
import zipfile
import os
# Unpack the uploaded archive with the standard library (no shell needed).
zip_path = "/content/ss.zip"
extract_path = "/content/ss"

# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print(f"folder extracted to: {extract_path}")

folder extracted to: /content/ss


In [5]:
import mimetypes
# guess_type() derives the MIME type from the file *name* only; it never opens the file,
# so this is a check of the extension, not of the archive's integrity.
mime_type, encoding = mimetypes.guess_type(zip_path)
print(mime_type)  # "application/zip" is expected for a ".zip" file name; it does not prove the ZIP is valid

application/zip


In [6]:
import os
import librosa
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
import torch.nn as nn

class AudioEncoder(nn.Module):
    """Convolutional encoder that outputs the latent Gaussian's mean and log-variance.

    Two stride-2 convolutions shrink each spatial dimension by a factor of four.
    The linear heads are sized for a flattened 64-channel 64x64 feature map,
    which is what a ``(batch, 1, 256, 256)`` input produces.

    Args:
        latent_dim: Dimensionality of the latent space.
    """

    def __init__(self, latent_dim=128):
        super().__init__()
        downsampling_layers = [
            nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*downsampling_layers)

        # channels * height * width of the feature map fed to the linear heads
        flat_features = 64 * 64 * 64
        self.fc_mu = nn.Linear(flat_features, latent_dim)
        self.fc_logvar = nn.Linear(flat_features, latent_dim)

    def forward(self, x):
        """Return ``(mu, logvar)``, each of shape ``(batch, latent_dim)``."""
        feature_map = self.conv(x)
        flattened = feature_map.view(len(feature_map), -1)
        return self.fc_mu(flattened), self.fc_logvar(flattened)

class AudioDecoder(nn.Module):
    """Transposed-convolution decoder mapping a latent vector to a spectrogram image.

    The latent vector is projected to a 64-channel 64x64 feature map, then two
    stride-2 transposed convolutions double the spatial size twice. The final
    Sigmoid keeps every output value in ``[0, 1]``.

    Args:
        latent_dim: Dimensionality of the latent space.
    """

    def __init__(self, latent_dim=128):
        super().__init__()
        self.fc = nn.Linear(latent_dim, 64 * 64 * 64)

        # Both upsampling layers share the same geometry (each doubles H and W).
        upsample_kwargs = dict(kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv = nn.Sequential(
            nn.ConvTranspose2d(64, 32, **upsample_kwargs),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 1, **upsample_kwargs),
            nn.Sigmoid(),
        )

    def forward(self, z):
        """Decode ``z`` of shape ``(batch, latent_dim)`` into ``(batch, 1, 256, 256)``."""
        projected = self.fc(z)
        feature_map = projected.view(projected.size(0), 64, 64, 64)
        return self.conv(feature_map)

class VAE(nn.Module):
    """Variational autoencoder built from ``AudioEncoder`` and ``AudioDecoder``.

    Args:
        latent_dim: Dimensionality of the latent space shared by both halves.
    """

    def __init__(self, latent_dim=128):
        super().__init__()
        self.encoder = AudioEncoder(latent_dim)
        self.decoder = AudioDecoder(latent_dim)

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * sigma`` with ``eps ~ N(0, I)``.

        Writing the sample this way keeps it differentiable w.r.t. ``mu`` and ``logvar``.
        """
        sigma = (0.5 * logvar).exp()
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the input batch ``x``."""
        mu, logvar = self.encoder(x)
        latent_sample = self.reparameterize(mu, logvar)
        reconstruction = self.decoder(latent_sample)
        return reconstruction, mu, logvar

In [4]:
import torch
import numpy as np
import os
import librosa

class AudioDataset(torch.utils.data.Dataset):
    """Dataset of pre-computed spectrograms stored as ``.npy`` files.

    Each item is loaded, zero-padded or truncated along its last axis to
    ``target_length``, min-max scaled to ``[0, 1]`` and returned as a float32
    tensor with one extra leading axis.

    Args:
        spectrogram_dir: Directory containing the ``.npy`` spectrogram files.
        target_length: Required size of the last (time) axis of every item.
    """

    def __init__(self, spectrogram_dir, target_length=29049):
        # os.listdir order is arbitrary; sort so that index -> file is reproducible.
        self.files = sorted(
            os.path.join(spectrogram_dir, f)
            for f in os.listdir(spectrogram_dir)
            if f.endswith('.npy')
        )
        self.target_length = target_length

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        spectrogram = np.load(self.files[idx])

        current_length = spectrogram.shape[-1]
        if current_length < self.target_length:
            # Pad only the last axis, whatever the rank of the stored array is.
            pad_spec = [(0, 0)] * (spectrogram.ndim - 1)
            pad_spec.append((0, self.target_length - current_length))
            spectrogram = np.pad(spectrogram, pad_spec, mode='constant')
        elif current_length > self.target_length:
            spectrogram = spectrogram[..., :self.target_length]

        # Min-max scale to [0, 1]. A constant spectrogram has zero range and would
        # otherwise turn into NaNs (0 / 0); map it to all zeros instead.
        lowest = spectrogram.min()
        value_range = spectrogram.max() - lowest
        if value_range > 0:
            spectrogram = (spectrogram - lowest) / value_range
        else:
            spectrogram = np.zeros(spectrogram.shape, dtype=np.float32)

        return torch.tensor(spectrogram).unsqueeze(0).float()

In [5]:
import os
import librosa
import numpy as np
import soundfile as sf

def preprocess_audio(input_dir, output_dir, target_duration_sec=120, sr=22050):
    """Force every ``.mp3`` / ``.wav`` file in ``input_dir`` to a fixed duration.

    Each file is loaded at sample rate ``sr``, then zero-padded at the end or
    truncated so that it holds exactly ``target_duration_sec * sr`` samples, and
    written to ``output_dir`` under its original file name. One progress line is
    printed per processed file.

    Args:
        input_dir: Directory containing the source audio files.
        output_dir: Directory that receives the processed files (created if missing).
        target_duration_sec: Desired duration of every output file, in seconds.
        sr: Sample rate used for both loading and writing.
    """
    os.makedirs(output_dir, exist_ok=True)
    target_samples = target_duration_sec * sr

    for file in os.listdir(input_dir):
        # Skip anything that is not one of the supported audio formats.
        if not file.endswith(('.mp3', '.wav')):
            continue

        y, _ = librosa.load(os.path.join(input_dir, file), sr=sr)
        current_length = len(y)

        missing_samples = target_samples - current_length
        if missing_samples > 0:
            # Too short: append silence.
            y = np.pad(y, (0, missing_samples), 'constant')
        elif missing_samples < 0:
            # Too long: keep only the beginning.
            y = y[:target_samples]

        sf.write(os.path.join(output_dir, file), y, sr)

        print(f"Processed file: {file} | Original length: {current_length / sr:.2f} sec | Target length: {target_samples / sr:.2f} sec")
# Bring both singers' recordings to the same 120-second length, in the same order as before.
for source_dir, destination_dir in (
    ('/content/Honey_Singh', 'preprocessed_Honey_Singh'),
    ('/content/Atif_Aslam', 'preprocessed_Atif_Aslam'),
):
    preprocess_audio(source_dir, destination_dir, target_duration_sec=120, sr=22050)

Processed file: Dheere_Dheere_Se_Meri_Zindagi_Video_Song_(OFFICIAL)_Hrithik_Roshan,_Sonam_Ka.mp3 | Original length: 304.00 sec | Target length: 120.00 sec
Processed file: Alcoholic Full Video _ The Shaukeens _ Yo Yo Honey Singh _ Akshay Kumar & Lisa Haydon.mp3 | Original length: 194.44 sec | Target length: 120.00 sec
Processed file: Dope Shope International Villager 128 Kbps.mp3 | Original length: 188.61 sec | Target length: 120.00 sec
Processed file: [Songs.PK] Bhoothnath 03 - Party With Bhoothnath.mp3 | Original length: 321.46 sec | Target length: 120.00 sec
Processed file: 024 Brown Rang (INTERNATIONAL VILLAGER).mp3 | Original length: 179.17 sec | Target length: 120.00 sec
Processed file: Party All Night Boss 128 Kbps.mp3 | Original length: 282.82 sec | Target length: 120.00 sec
Processed file: Blue Eyes Yo Yo Honey Singh 128 Kbps.mp3 | Original length: 221.00 sec | Target length: 120.00 sec
Processed file: Pehli Nazar Main-(Mr-Jatt.com).mp3 | Original length: 311.88 sec | Target le

In [7]:
import torch
import torch.nn as nn
import librosa
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met

class AudioDataset(Dataset):
    """Dataset that turns audio files into fixed-width, [0, 1]-scaled log-mel spectrograms.

    Every ``.mp3`` / ``.wav`` file in ``audio_dir`` is loaded at sample rate ``sr``,
    converted to a log-mel spectrogram (dB relative to the file's peak), rescaled
    from ``[-TOP_DB, 0]`` dB to ``[0, 1]`` and padded or truncated along the frame
    axis to ``target_length``. The [0, 1] range matches a decoder that ends in a
    Sigmoid; raw dB values (all <= 0) could never be reconstructed by it.

    Args:
        audio_dir: Directory containing the audio files.
        target_length: Number of spectrogram frames in every returned item.
        sr: Sample rate used when loading audio.
    """

    # Dynamic range kept by power_to_db; with ref=np.max values lie in [-TOP_DB, 0].
    TOP_DB = 80.0

    def __init__(self, audio_dir, target_length=29049, sr=22050):
        self.audio_dir = audio_dir
        self.target_length = target_length
        self.sr = sr
        # os.listdir order is arbitrary; sort so that index -> file is reproducible.
        self.files = sorted(f for f in os.listdir(audio_dir) if f.endswith(('.mp3', '.wav')))

    def __len__(self):
        return len(self.files)

    @staticmethod
    def _scale_and_fit(log_mel_spec, target_length, top_db=80.0):
        """Map dB values in ``[-top_db, 0]`` to ``[0, 1]`` and fix the frame count.

        Padding is done after scaling, so the padded frames carry 0.0, i.e. the
        quietest representable level. Padding the raw dB array with 0 would
        instead insert frames at peak loudness.
        """
        scaled = np.clip((log_mel_spec + top_db) / top_db, 0.0, 1.0)
        n_frames = scaled.shape[1]
        if n_frames < target_length:
            scaled = np.pad(scaled, ((0, 0), (0, target_length - n_frames)), mode='constant')
        elif n_frames > target_length:
            scaled = scaled[:, :target_length]
        return scaled

    def __getitem__(self, idx):
        file_path = os.path.join(self.audio_dir, self.files[idx])
        y, sr = librosa.load(file_path, sr=self.sr)

        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512)

        # top_db is passed explicitly so the rescaling below uses the same range.
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max, top_db=self.TOP_DB)

        features = self._scale_and_fit(log_mel_spec, self.target_length, self.TOP_DB)

        # Leading axis is the single image channel expected by Conv2d.
        return torch.tensor(features, dtype=torch.float32).unsqueeze(0)

class AudioEncoder(nn.Module):
    """Convolutional encoder that outputs the latent Gaussian's mean and log-variance.

    Two ``Conv2d(kernel_size=3, stride=2, padding=1)`` layers downsample the
    input; the resulting 32-channel feature map is flattened and fed to two
    linear heads.

    Args:
        latent_dim: Dimensionality of the latent space.
        input_shape: ``(height, width)`` of the input spectrograms. The default
            ``(128, 29049)`` reproduces the previously hard-coded ``32 * 32 * 7263``
            flattened size.

    Note:
        The linear heads grow with the input area. With the default shape and
        ``latent_dim=64`` each head holds 32*32*7263*64 (about 476M) weights,
        roughly 1.77 GiB in float32. Pass a smaller ``input_shape`` to shrink them.
    """

    def __init__(self, latent_dim=64, input_shape=(128, 29049)):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU()
        )
        feature_h, feature_w = self.feature_map_size(input_shape)
        flat_features = 32 * feature_h * feature_w
        self.fc_mu = nn.Linear(flat_features, latent_dim)
        self.fc_logvar = nn.Linear(flat_features, latent_dim)

    @staticmethod
    def feature_map_size(input_shape):
        """Return the ``(height, width)`` left after the two stride-2 convolutions."""
        height, width = input_shape
        for _ in range(2):
            # Conv2d(kernel=3, stride=2, padding=1): n -> floor((n - 1) / 2) + 1
            height = (height - 1) // 2 + 1
            width = (width - 1) // 2 + 1
        return height, width

    def forward(self, x):
        """Return ``(mu, logvar)``, each of shape ``(batch, latent_dim)``."""
        x = self.conv(x)
        x = x.view(x.size(0), -1)
        mu = self.fc_mu(x)
        logvar = self.fc_logvar(x)
        return mu, logvar

class AudioDecoder(nn.Module):
    """Transposed-convolution decoder mapping a latent vector back to a spectrogram.

    The latent vector is projected to a 32-channel feature map and upsampled by
    two stride-2 transposed convolutions. The per-axis ``output_padding`` of each
    layer is derived from ``output_shape`` so that the result has exactly that
    size. A fixed ``output_padding=1`` always doubles the size, which for the
    default width gives 4 * 7263 = 29052 instead of the 29049 frames the encoder
    input has, making the reconstruction loss fail on mismatched shapes.

    Args:
        latent_dim: Dimensionality of the latent space.
        output_shape: ``(height, width)`` of the spectrogram to reconstruct.
            The default ``(128, 29049)`` keeps the previous ``32 * 32 * 7263``
            projection size, so parameter shapes are unchanged.
    """

    def __init__(self, latent_dim=64, output_shape=(128, 29049)):
        super().__init__()
        (self.feature_h, self.feature_w), paddings = self.upsample_plan(output_shape)
        self.fc = nn.Linear(latent_dim, 32 * self.feature_h * self.feature_w)
        self.conv = nn.Sequential(
            nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, padding=1, output_padding=paddings[0]),
            nn.ReLU(),
            nn.ConvTranspose2d(16, 1, kernel_size=3, stride=2, padding=1, output_padding=paddings[1]),
            nn.Sigmoid()
        )

    @staticmethod
    def upsample_plan(output_shape):
        """Return ``(bottleneck_hw, [output_padding_layer1, output_padding_layer2])``.

        The sizes mirror two ``Conv2d(kernel=3, stride=2, padding=1)`` layers
        (``n -> (n - 1) // 2 + 1``). A matching transposed convolution yields
        ``2 * n - 1 + output_padding``, so the padding is whatever is missing
        to reach the larger size (always 0 or 1).
        """
        height, width = output_shape
        sizes = [(int(height), int(width))]
        for _ in range(2):
            h, w = sizes[-1]
            sizes.append(((h - 1) // 2 + 1, (w - 1) // 2 + 1))
        full, half, quarter = sizes
        paddings = [
            tuple(big - (2 * small - 1) for big, small in zip(half, quarter)),
            tuple(big - (2 * small - 1) for big, small in zip(full, half)),
        ]
        return quarter, paddings

    def forward(self, z):
        """Decode ``z`` of shape ``(batch, latent_dim)`` into ``(batch, 1, *output_shape)``."""
        z = self.fc(z)
        z = z.view(z.size(0), 32, self.feature_h, self.feature_w)
        return self.conv(z)

class VAE(nn.Module):
    """Variational autoencoder wiring an ``AudioEncoder`` to an ``AudioDecoder``.

    Args:
        latent_dim: Dimensionality of the latent space used by both sub-modules.
    """

    def __init__(self, latent_dim=64):
        super().__init__()
        self.encoder = AudioEncoder(latent_dim)
        self.decoder = AudioDecoder(latent_dim)

    def forward(self, x):
        """Encode ``x``, sample a latent vector and decode it.

        Returns:
            Tuple ``(reconstruction, mu, logvar)``.
        """
        mu, logvar = self.encoder(x)
        sampled_latent = self.reparameterize(mu, logvar)
        return self.decoder(sampled_latent), mu, logvar

    def reparameterize(self, mu, logvar):
        """Reparameterization trick: ``z = mu + eps * std`` with ``eps ~ N(0, I)``."""
        # logvar = log(sigma^2), hence sigma = exp(logvar / 2)
        std = torch.exp(logvar * 0.5)
        standard_normal = torch.randn_like(std)
        return mu + standard_normal * std

def vae_loss(recon_x, x, mu, logvar, beta=0.001):
    """Beta-weighted VAE objective: summed reconstruction error plus KL divergence.

    Args:
        recon_x: Reconstruction produced by the decoder.
        x: Target tensor, same shape as ``recon_x``.
        mu: Mean of the approximate posterior.
        logvar: Log-variance of the approximate posterior.
        beta: Weight applied to the KL term.

    Returns:
        Scalar tensor ``sum((recon_x - x)^2) + beta * KL(N(mu, sigma^2) || N(0, I))``.
    """
    reconstruction_term = nn.functional.mse_loss(recon_x, x, reduction='sum')
    # Closed-form KL divergence between a diagonal Gaussian and the standard normal.
    kl_term = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum()
    return reconstruction_term + beta * kl_term

# --- Data: one dataset per singer, merged into a single shuffled loader ---
singer_A_dataset = AudioDataset('preprocessed_Honey_Singh', target_length=29049)
singer_B_dataset = AudioDataset('preprocessed_Atif_Aslam', target_length=29049)

dataset = torch.utils.data.ConcatDataset([singer_A_dataset, singer_B_dataset])
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

# --- Model / optimizer on the XLA (TPU) device ---
# NOTE(review): with target_length=29049 each encoder linear head has
# 32*32*7263*64 weights (~1.77 GiB in float32), which matches the allocation in
# the recorded RESOURCE_EXHAUSTED error; a smaller spectrogram width is needed to fit.
device = xm.xla_device()
vae = VAE(latent_dim=64).to(device)
optimizer = optim.Adam(vae.parameters(), lr=1e-3)

NUM_EPOCHS = 20
ACCUMULATION_STEPS = 4  # batches whose gradients are summed before one optimizer update
num_batches = len(dataloader)

for epoch in range(NUM_EPOCHS):
    vae.train()
    train_loss = 0
    optimizer.zero_grad()

    for i, spectrogram in enumerate(dataloader):
        spectrogram = spectrogram.to(device)

        recon, mu, logvar = vae(spectrogram)

        loss = vae_loss(recon, spectrogram, mu, logvar)

        # Gradients accumulate across batches until the next optimizer step.
        loss.backward()

        # Update after every full accumulation group AND on the last batch of the
        # epoch. Without the second condition a trailing partial group is thrown
        # away by the zero_grad() at the start of the next epoch, and with fewer
        # than ACCUMULATION_STEPS batches per epoch the model is never updated.
        is_last_batch = (i + 1) == num_batches
        if (i + 1) % ACCUMULATION_STEPS == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        train_loss += loss.item()

    # vae_loss sums over elements, so normalise by the number of samples.
    print(f'Epoch {epoch + 1}, Loss: {train_loss / len(dataloader.dataset)}')
    metrics = met.metrics_report()
    print(f"TPU Metrics: {metrics}")

RuntimeError: Bad StatusOr access: RESOURCE_EXHAUSTED: Error allocating device buffer: Attempting to allocate 1.77G. That was not possible. There are 370.19M free.; (0x0x0_HBM0)