# Quick Start Guide

This notebook provides a quick demonstration of the music classification pipeline without requiring actual datasets.

## 1. Check Installation

In [None]:
import importlib
import sys

print(f"Python version: {sys.version}")

# Check required packages (listed by the name used in `import` statements)
required_packages = ['torch', 'torchaudio', 'numpy', 'matplotlib', 'librosa', 'sklearn']

# Import name -> PyPI distribution name, for packages where the two differ.
# scikit-learn is imported as `sklearn` but must be installed as `scikit-learn`,
# so the install hint cannot simply reuse the import name.
pip_names = {'sklearn': 'scikit-learn'}

for package in required_packages:
    try:
        # Actually import (rather than only locating) the package so that a
        # present-but-unimportable install is reported as missing too.
        importlib.import_module(package)
    except ImportError:
        print(f"✗ {package} is NOT installed")
        print(f"   Install with: pip install {pip_names.get(package, package)}")
    else:
        print(f"✓ {package} is installed")

In [None]:
import torch
import torch.nn as nn
import torchaudio
import numpy as np
import matplotlib.pyplot as plt

# Report the PyTorch build and, when a CUDA device is usable, which one it is.
cuda_available = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_available}")
if cuda_available:
    # Device index 0 is the first visible GPU.
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

## 2. Model Architecture Demo

Let's create and test model architectures with synthetic data.

In [None]:
class SimpleCNN(nn.Module):
    """Small 2-D CNN that classifies audio from its mel-spectrogram.

    The forward pass converts the input waveform to a mel-spectrogram in dB,
    applies four conv -> batch-norm -> ReLU -> 2x2 max-pool stages
    (1 -> 32 -> 64 -> 128 -> 256 channels), averages every channel map down to
    one value, and feeds the result to a two-layer classifier head. The output
    is raw logits; no softmax is applied inside the model.
    """

    # Channel width entering the first stage and leaving each subsequent stage.
    _STAGE_CHANNELS = (1, 32, 64, 128, 256)

    def __init__(self, n_classes=10, sample_rate=22050, n_mels=128):
        """Build the spectrogram front end, the conv stages and the classifier.

        Args:
            n_classes: Number of output logits.
            sample_rate: Sample rate in Hz handed to the mel-spectrogram transform.
            n_mels: Number of mel bins produced by the transform.
        """
        super().__init__()

        self.sample_rate, self.n_mels = sample_rate, n_mels

        # Front end: waveform -> mel-spectrogram -> dB scale.
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate, n_fft=2048, hop_length=512, n_mels=n_mels
        )
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

        # Conv stages, registered as conv1/bn1/pool1 ... conv4/bn4/pool4 in that
        # order so parameter names and ordering stay stable.
        widths = self._STAGE_CHANNELS
        for stage, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"conv{stage}", nn.Conv2d(c_in, c_out, kernel_size=3, padding=1))
            setattr(self, f"bn{stage}", nn.BatchNorm2d(c_out))
            setattr(self, f"pool{stage}", nn.MaxPool2d(2, 2))

        # Collapses each channel map to 1x1, so the head is independent of the
        # spectrogram's height and width.
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

        # Classifier head
        self.fc1 = nn.Linear(256, 128)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, n_classes)

    def forward(self, x):
        """Return class logits of shape (batch, n_classes) for a batch of audio.

        Args:
            x: Waveform tensor. This notebook passes (batch, 1, n_samples);
                other layouts are untested here.
        """
        features = self.amplitude_to_db(self.mel_spec(x))

        for stage in range(1, len(self._STAGE_CHANNELS)):
            conv = getattr(self, f"conv{stage}")
            norm = getattr(self, f"bn{stage}")
            pool = getattr(self, f"pool{stage}")
            features = pool(torch.relu(norm(conv(features))))

        # (batch, 256, 1, 1) -> (batch, 256)
        pooled = torch.flatten(self.global_pool(features), start_dim=1)

        hidden = self.dropout(torch.relu(self.fc1(pooled)))
        return self.fc2(hidden)

In [None]:
# Seed PyTorch's global RNG before building the model. Layer weights are drawn
# randomly at construction time, so without a seed the logits and probabilities
# printed further down change on every kernel restart.
torch.manual_seed(42)

# Create model instance
model = SimpleCNN(n_classes=10)
print("Model created successfully!")

# All parameters vs. only those that receive gradients.
n_total = sum(p.numel() for p in model.parameters())
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nNumber of parameters: {n_total:,}")
print(f"Trainable parameters: {n_trainable:,}")

## 3. Test Model with Synthetic Data

In [None]:
# Create synthetic audio (random noise)
batch_size = 4
sample_rate = 22050
duration = 30  # seconds
n_samples = sample_rate * duration

# Generate random audio from a dedicated, seeded generator so that this cell
# yields exactly the same tensor every time it is run, regardless of what else
# has consumed the global RNG. Layout: (batch, channel, samples).
audio_rng = torch.Generator().manual_seed(0)
synthetic_audio = torch.randn(batch_size, 1, n_samples, generator=audio_rng)

print(f"Synthetic audio shape: {synthetic_audio.shape}")
print(f"Audio duration: {duration} seconds")
print(f"Sample rate: {sample_rate} Hz")

In [None]:
# Inference-only forward pass: eval() switches the dropout and batch-norm layers
# to evaluation behaviour, and no_grad() skips autograd bookkeeping.
model.eval()
with torch.no_grad():
    output = model(synthetic_audio)

first_logits = output[0]
print(f"Input shape: {synthetic_audio.shape}")
print(f"Output shape: {output.shape}")
print(f"\nOutput logits (first sample): {first_logits}")

# Softmax across the class dimension turns logits into probabilities
probabilities = output.softmax(dim=1)
first_probs = probabilities[0]
print(f"\nProbabilities (first sample): {first_probs}")
print(f"Sum of probabilities: {first_probs.sum():.4f}")

## 4. Visualize Mel-Spectrogram

In [None]:
# Stand-alone transforms for plotting, configured with the same n_fft,
# hop_length and n_mels that SimpleCNN uses internally.
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=sample_rate, n_fft=2048, hop_length=512, n_mels=128
)
amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

# First clip of the batch, still carrying its channel dimension
audio_sample = synthetic_audio[0]
mel_spec = mel_transform(audio_sample)
mel_spec_db = amplitude_to_db(mel_spec)

# Two stacked panels: raw waveform on top, spectrogram underneath
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(14, 8))
wave_ax, mel_ax = axes

wave_ax.plot(audio_sample[0].numpy())
wave_ax.set_title('Waveform (Synthetic Audio)', fontsize=14)
wave_ax.set(xlabel='Sample', ylabel='Amplitude')
wave_ax.grid(True, alpha=0.3)

# origin='lower' puts the lowest mel bin at the bottom of the image
im = mel_ax.imshow(mel_spec_db[0].numpy(), aspect='auto', origin='lower', cmap='viridis')
mel_ax.set_title('Mel-Spectrogram (dB)', fontsize=14)
mel_ax.set(xlabel='Time Frames', ylabel='Mel Frequency Bins')
fig.colorbar(im, ax=mel_ax, label='Amplitude (dB)')

fig.tight_layout()
plt.show()

## 5. Visualize Predictions

In [None]:
# GTZAN genre names
# NOTE(review): assumes the model's class indices follow this order - confirm
# against the label encoding used for training.
genre_names = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]

# Class probabilities of the first clip and the index of the most likely class
probs = probabilities[0].numpy()
predicted_idx = probs.argmax()

# Horizontal bar chart, one bar per genre
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(genre_names, probs, color='steelblue')
bars[predicted_idx].set_color('coral')  # make the predicted genre stand out

ax.set_xlabel('Probability', fontsize=12)
ax.set_title('Genre Prediction Probabilities', fontsize=14, fontweight='bold')
ax.set_xlim(0, 1)
ax.grid(axis='x', alpha=0.3)

# Print each probability just to the right of its bar
for row, prob in enumerate(probs):
    ax.text(prob + 0.01, row, f'{prob:.3f}', va='center')

fig.tight_layout()
plt.show()

print(f"\nPredicted genre: {genre_names[predicted_idx]}")
print(f"Confidence: {probs[predicted_idx]*100:.2f}%")

## 6. Model Summary

In [None]:
def count_parameters(model):
    """Print a per-tensor table of trainable parameters and return their total.

    Parameters with ``requires_grad=False`` are skipped, so they appear neither
    in the table nor in the totals. The reported size is the sum of
    ``numel() * element_size()`` over the counted parameters, which makes it
    correct for any parameter dtype (float16, float32, float64, ...) instead of
    assuming 4 bytes per value. Buffers such as batch-norm running statistics
    are not parameters and are not included.

    Args:
        model: A module exposing ``named_parameters()`` (e.g. ``torch.nn.Module``).

    Returns:
        int: Number of trainable parameter elements.
    """
    total_params = 0
    total_bytes = 0
    print("Layer-wise Parameter Count:")
    print("-" * 60)

    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        total_params += params
        # element_size() is the per-value byte width of this tensor's dtype.
        total_bytes += params * parameter.element_size()
        print(f"{name:40s} {params:>15,}")

    print("-" * 60)
    print(f"{'Total Parameters':40s} {total_params:>15,}")
    print(f"{'Model Size (MB)':40s} {total_bytes / (1024**2):>15.2f}")

    return total_params

# Print the parameter breakdown for the demo model. Because this call is the
# cell's final expression, the returned total is also echoed as the cell output.
count_parameters(model)

## Next Steps

Now that you've verified the installation and tested the model:

1. **Download datasets**: See README.md for dataset download instructions
2. **Load real data**: Use notebooks 01-03 to load GTZAN, MTAT, or FMA datasets
3. **Train models**: Use notebook 06 to train on real music data
4. **Evaluate**: Use notebook 07 for inference on new audio files

### Recommended Order:
1. `01_data_loading_gtzan.ipynb` - Start with GTZAN (smallest dataset)
2. `04_model_cnn.ipynb` - Explore CNN architectures
3. `05_model_rnn.ipynb` - Explore RNN architectures
4. `06_training.ipynb` - Train your first model
5. `07_inference.ipynb` - Make predictions

Happy music classification! 🎵