### REI505M Final project: Music genre classification starter pack

The following Dataset class operates on the GTZAN dataset.

* The duration of most GTZAN files is 30 seconds ($30 \times 22050 = 661500$ samples), but some are slightly shorter (approx. 29.9 seconds). For this reason we truncate at 660000 samples below.
* It may be beneficial to work with smaller chunks than ~30 seconds.
* You may want to perform the data augmentations in the `__getitem__` function.
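* In the original starter pack, `train_dataset` contained the entire dataset and you had to set aside some examples for validation and test sets yourself; in this notebook `get_partitioned_data` performs that split using the partition sizes given in `Config`.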

In [None]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn

from src.Config import Config
from src.AudioDataset import AudioDataset
from src.DataPreparation import get_partitioned_data

In [None]:
# Expected layout of the GTZAN audio folder (one sub-folder per genre):
#   music/
#     rock/
#       rock.00099.wav
#       ...
#     reggae/
#       ...
#     blues/
config = Config(
    audio_dir_path='../music/',  # path to the folder with the GTZAN files
    num_genres=2,                # how many genres to use, e.g. 2, 3, 5, 10
    # Data partition (fractions for train / validation / test)
    train_part_size=0.7,
    val_part_size=0.15,
    test_part_size=0.15,
    # Training hyper-parameters
    batch_size=32,
    learning_rate=1e-3,
    epochs=7,
    seed=42,
)

# Seed torch's global RNG so that runs are reproducible.
torch.manual_seed(config.seed)

In [None]:
# Load the files of the selected genres and split them into
# train / validation / test partitions as configured.
(train_files, train_labels,
 val_files, val_labels,
 test_files, test_labels) = get_partitioned_data(config)

# Audio settings shared by all three datasets.
dataset_kwargs = dict(maxlen=660000, sampling_rate=22050, duration=25)

# Training data: the only loader that is reshuffled every epoch.
train_dataset = AudioDataset(
    audio_files=train_files,
    labels=train_labels,
    audio_path=config.audio_dir_path,
    **dataset_kwargs,
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=config.batch_size)

# Validation data: fixed order.
val_dataset = AudioDataset(
    val_files, val_labels, config.audio_dir_path, **dataset_kwargs
)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=config.batch_size)

# Test data: fixed order.
test_dataset = AudioDataset(
    test_files, test_labels, config.audio_dir_path, **dataset_kwargs
)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=config.batch_size)

# Sanity check: pull a single training batch and show its shapes.
tmp_features, tmp_labels = next(iter(train_loader))
print(f"Feature batch shape: {tmp_features.size()}")
print(f"Labels batch shape: {tmp_labels.size()}")

In [None]:
class Conv1D(nn.Module):
    """Minimal 1-D CNN classifier.

    Pipeline: Conv1d (stride 2) -> ReLU -> optional MaxPool1d (stride 2)
    -> global average pooling over time -> linear layer producing logits.

    Args:
        in_c: Number of input channels.
        out_c: Number of convolution filters; also the size of the feature
            vector passed to the linear layer.
        k: Convolution kernel size. Padding is set to ``k // 2``.
        use_pool: If True, apply the max-pool after the activation;
            if False, the pooling step is skipped.
        n_classes: Number of output logits.
    """

    def __init__(self, in_c=1, out_c=64, k=7, use_pool=True, n_classes=10):
        super().__init__()
        pad = k // 2
        self.conv = nn.Conv1d(in_c, out_c, kernel_size=k, stride=2, padding=pad)
        self.use_pool = use_pool
        # Always created (it has no parameters) so the attribute exists
        # regardless of ``use_pool``; it is only applied when the flag is set.
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.gap = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(out_c, n_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Tensor of shape (batch, in_c, length).

        Returns:
            Tensor of shape (batch, n_classes) with unnormalised logits.
        """
        x = nn.functional.relu(self.conv(x))
        # Honour the constructor flag; previously the pool was applied
        # unconditionally and ``use_pool=False`` had no effect.
        if self.use_pool:
            x = self.pool(x)
        x = self.gap(x)       # (batch, out_c, 1)
        x = x.squeeze(-1)     # (batch, out_c)
        x = self.fc(x)        # (batch, n_classes)
        return x


# One output logit per selected genre.
n_classes = config.num_genres

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the network and move its parameters to the chosen device
# before handing them to the optimizer.
model = Conv1D(in_c=1, out_c=64, k=7, use_pool=True, n_classes=n_classes)
model = model.to(device)

# Adam optimizer and the standard multi-class classification loss.
opt = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
crit = nn.CrossEntropyLoss()

def batch_accuracy(logits, y):
    """Count correct predictions in one batch.

    Args:
        logits: Score tensor; the predicted class is the argmax along dim 1.
        y: Tensor of target class indices, one per row of ``logits``.

    Returns:
        Tuple ``(n_correct, n_samples)`` of Python ints.
    """
    predicted = torch.argmax(logits, dim=1)
    n_correct = torch.eq(predicted, y).sum().item()
    n_samples = y.size(0)
    return n_correct, n_samples

# Switch to training mode once; nothing in this loop puts the model in eval mode.
model.train()
for epoch in range(config.epochs):
    # Per-epoch accumulators: sample-weighted loss sum, hits, samples seen.
    running_loss, correct, total = 0.0, 0.0, 0.0

    for xb, yb in train_loader:
        # A 2-D batch has no channel axis; insert one at dim 1 before
        # feeding it to the Conv1d-based model.
        xb = xb.unsqueeze(1) if xb.dim() == 2 else xb
        # The labels are used as class indices by the loss, so make them int64.
        yb = yb if yb.dtype == torch.long else yb.long()

        xb = xb.to(device)
        yb = yb.to(device)

        # Standard optimisation step.
        opt.zero_grad()
        logits = model(xb)
        loss = crit(logits, yb)
        loss.backward()
        opt.step()

        # The loss is a batch mean, so weight it by the batch size to get
        # a correct per-sample average at the end of the epoch.
        c, t = batch_accuracy(logits, yb)
        running_loss += loss.item() * xb.size(0)
        correct += c
        total += t

    epoch_loss = running_loss / len(train_dataset)
    epoch_acc = correct / total
    print(f"epoch {epoch+1} | train loss {epoch_loss:.4f} | train acc {epoch_acc:.4f}")

 