In [1]:
from glob import glob
from pathlib import Path
import kagglehub
from utils.audio_loader import load_audio
import warnings
from models.CGANGenerator import Generator
from models.CGANDiscriminator import Discriminator
warnings.filterwarnings("ignore")
from utils.train_CGAN import train_cgan
from utils.audio_utils import AudioDataset
from torch.utils.data import  DataLoader
from utils.audio_utils import MFCCFeatureExtractor
from utils.audio_generator import generate_all_classes
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locate the GTZAN dataset on disk (downloading it through KaggleHub when it
# is missing) and collect the paths of every genre WAV file.
#
# - path points to the KaggleHub download cache of the GTZAN dataset, resolved
#   relative to the current user's home directory rather than a fixed account.
# - BASE_DIR is the folder that holds the genres_original/ directory.
# - audio_paths collects full paths to all WAV files inside genre folders,
#   sorted so the file order is the same on every run.
path = (
    Path.home()
    / ".cache" / "kagglehub" / "datasets"
    / "andradaolteanu" / "gtzan-dataset-music-genre-classification"
    / "versions" / "1"
)
if not path.exists():
    print("[INFO] GTZAN dataset not found locally. Downloading from KaggleHub...")
    path = Path(kagglehub.dataset_download("andradaolteanu/gtzan-dataset-music-genre-classification"))
    print("[INFO] Dataset downloaded to:", path)

# Directory listing order is arbitrary, so sort it before choosing anything.
entries = sorted(path.iterdir())
if not entries:
    raise FileNotFoundError(f"[ERROR] No contents found inside {path}.")

# Prefer whichever location actually contains genres_original/ (the version
# folder itself or one of its children). Fall back to the first entry so the
# "no audio files" check below still reports the problem when none does.
BASE_DIR = next(
    (candidate for candidate in [path, *entries] if (candidate / "genres_original").is_dir()),
    entries[0],
)

audio_paths = sorted(glob(str(BASE_DIR / "genres_original" / "*" / "*.wav")))
if not audio_paths:
    raise FileNotFoundError(f"[ERROR] No audio files found in {BASE_DIR}/genres_original/.")

print(f"[SUCCESS] Loaded {len(audio_paths)} audio files.")

[SUCCESS] Loaded 1000 audio files.


In [3]:
# Build the index -> genre-name mapping from the genre sub-folders and load the
# audio. The folder names are sorted because directory iteration order is
# arbitrary; without sorting, the same genre could receive a different integer
# label on another machine or run.
genre_dir = Path(BASE_DIR) / "genres_original"
genres = sorted(p.name for p in genre_dir.iterdir() if p.is_dir())
label_dict = dict(enumerate(genres))
audio_np, labels = load_audio(audio_paths, label_dict, sampling_rate=22050)

Finding minimum length:  84%|████████▍ | 838/1000 [00:16<00:02, 74.87it/s]



Finding minimum length: 100%|██████████| 1000/1000 [00:19<00:00, 52.14it/s]
Loading and segmenting audio: 100%|██████████| 999/999 [00:14<00:00, 67.10it/s] 


In [4]:
# Wrap the loaded audio and its labels in a dataset object and serve it in
# shuffled mini-batches of 32.
audio_dataset = AudioDataset(audio_np, labels)
Dataset = DataLoader(dataset=audio_dataset, shuffle=True, batch_size=32)

# Conditional GAN pair: both networks are built for 10 classes, and the
# generator's output size equals the discriminator's input size (66000).
G = Generator(
    latent_dim=100,
    num_classes=10,
    output_dim=66000,
)
D = Discriminator(
    input_dim=66000,
    num_classes=10,
)

In [5]:
# Train the conditional GAN for 100 epochs. Use the GPU when one is available
# and fall back to the CPU otherwise, so the cell does not fail on machines
# without CUDA.
train_device = 'cuda' if torch.cuda.is_available() else 'cpu'
if train_device != 'cuda':
    print("[INFO] CUDA is not available; training on CPU (this will be slow).")

CGAN = train_cgan(generator=G, discriminator=D, dataloader=Dataset,
           num_classes=10, latent_dim=100, epochs=100, device=train_device)

Epoch [1/100]: 100%|██████████| 313/313 [01:07<00:00,  4.64it/s, D_loss=1.32, G_loss=0.732] 
Epoch [2/100]: 100%|██████████| 313/313 [01:05<00:00,  4.79it/s, D_loss=1.18, G_loss=3.21] 
Epoch [3/100]: 100%|██████████| 313/313 [01:04<00:00,  4.84it/s, D_loss=0.528, G_loss=2.64]
Epoch [4/100]: 100%|██████████| 313/313 [01:05<00:00,  4.74it/s, D_loss=0.175, G_loss=4.21]
Epoch [5/100]: 100%|██████████| 313/313 [01:05<00:00,  4.79it/s, D_loss=0.0424, G_loss=4.79]
Epoch [6/100]: 100%|██████████| 313/313 [00:46<00:00,  6.78it/s, D_loss=0.135, G_loss=6.08] 
Epoch [7/100]: 100%|██████████| 313/313 [00:43<00:00,  7.26it/s, D_loss=0.0431, G_loss=5.57]
Epoch [8/100]: 100%|██████████| 313/313 [00:47<00:00,  6.63it/s, D_loss=0.333, G_loss=5.38] 
Epoch [9/100]: 100%|██████████| 313/313 [00:43<00:00,  7.27it/s, D_loss=0.0746, G_loss=4.74] 
Epoch [10/100]: 100%|██████████| 313/313 [00:49<00:00,  6.34it/s, D_loss=0.0982, G_loss=12]  
Epoch [11/100]: 100%|██████████| 313/313 [00:46<00:00,  6.74it/s, D_los

In [6]:
# Ask the generator G for synthetic clips (n_per_class=999) together with
# their class labels, then report the shape of both results.
synthetic_data, synthetic_labels = generate_all_classes(G, n_per_class=999)
for kind, batch in (("Data", synthetic_data), ("Labels", synthetic_labels)):
    print(f"Synthetic {kind} Shape: {batch.shape}")

# Feature extractor applied to the synthetic clips in the following cell.
extractor = MFCCFeatureExtractor()

Synthetic Data Shape: torch.Size([9990, 66000])
Synthetic Labels Shape: torch.Size([9990])


In [7]:
# Compute one feature matrix per synthetic clip and collect them all in a
# single array. The array shape is derived from the data (number of clips and
# the shape of the first extracted matrix) instead of being hard-coded.
#
# NOTE(review): the real audio was loaded with sampling_rate=22050 earlier in
# this notebook, while features are extracted here with sr=16000 — confirm
# which rate is intended before relying on these features.
num_segments = len(synthetic_data)
if num_segments == 0:
    raise ValueError("synthetic_data is empty; there is nothing to extract features from.")

feature_extraction = None
for i, segment in enumerate(synthetic_data):
    if isinstance(segment, torch.Tensor):
        # detach() guards against tensors that still track gradients, for
        # which .numpy() would raise.
        segment = segment.detach().cpu().numpy()
    combined_features = np.asarray(extractor._extract_segment_features(segment, sr=16000))
    if feature_extraction is None:
        # Allocate once the per-clip feature shape is known (float64 zeros,
        # matching np.zeros' default dtype).
        feature_extraction = np.zeros((num_segments, *combined_features.shape))
    feature_extraction[i] = combined_features
print(feature_extraction.shape)

(9990, 61, 33)


In [None]:
# Load the pre-computed features and labels of the real recordings from disk
# and place them on the compute device (GPU when available, otherwise CPU).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# SECURITY: allow_pickle=True lets np.load unpickle arbitrary Python objects.
# Only load a features.npy file that you produced yourself or otherwise trust.
raw_features = np.load("features.npy", allow_pickle=True)
features = np.stack([f.astype(np.float32) for f in raw_features])

# Stored under its own name so the `labels` returned by load_audio (paired
# with audio_np) is not overwritten; re-running earlier cells stays consistent.
real_labels_np = np.load("labels.npy")

real_features = torch.as_tensor(features, device=device)
real_labels = torch.as_tensor(real_labels_np, device=device)

In [None]:
# Concatenate the real and the synthetic samples (features and labels) along
# the first axis. Every operand is moved to the same device first, because
# torch.cat requires all tensors to live on one device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): feature_extraction is float64 (np.zeros default) while the
# real features were cast to float32 — confirm the dtype wanted downstream.
feature_tensor = torch.as_tensor(feature_extraction, device=device)
real_features = real_features.to(device)
real_labels = real_labels.to(device)
# The synthetic labels come straight from the generator helper and may be on
# a different device than the real labels.
synthetic_labels_on_device = synthetic_labels.to(device)

combined_features = torch.cat([real_features, feature_tensor], dim=0)
combined_labels = torch.cat([real_labels, synthetic_labels_on_device], dim=0)

In [14]:
# Copy the combined tensors back to host memory as NumPy arrays and write
# each one to its own .npy file in the working directory.
features_np = combined_features.cpu().numpy()
labels_np = combined_labels.cpu().numpy()
for out_name, out_array in (
    ('wav_CGAN_features.npy', features_np),
    ('wav_CGAN_labels.npy', labels_np),
):
    np.save(out_name, out_array)