# Transfer Learning:  CNN 1D + CNN 2D 
Goal: use already trained architectures to train a new classifier that gets information both from 1D and 2D data (raw audio + mel spectrogram).
To do list:

- Load the trained models and their weights
- Set the models to evaluation mode and freeze their parameters so that their gradients are not tracked. We only want to use the pre-trained models as they are and train the new classifier.
- Build the new model:

    - Extract CNN layers from NNET1D and NNET2D
    - Create a new MixNet class that imports these layers and adds a fully-connected block on top of their outputs.
    
     

## Import libraries

In [2]:
from utils_mix import NNET1D, LN1D, NNET2, LN2D
import numpy as np
import torchvision.transforms.v2 as v2
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import os 
from torch.optim import Adadelta
import pytorch_lightning as pl
import pickle
import utils
from utils_mgr import DataAudio, create_subset, MinMaxScaler
import os



2024-01-15 15:18:47.496701: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load trained architectures 


In [3]:

# Checkpoint files of the two pre-trained models (1D network and 2D network).
CKPT_PATH_1D = "./1Dcheckpoint_Albi.ckpt"
CKPT_PATH_2D = "../lightning_logs_2D_final/version_4/checkpoints/epoch=69-step=7000.ckpt"

# Restore each model from its checkpoint and switch it to evaluation mode
# right away; the 1D model is handled first, then the 2D one.
nnet1d, nnet2d = (
    model_cls.load_from_checkpoint(checkpoint_path=ckpt_path).eval()
    for model_cls, ckpt_path in ((LN1D, CKPT_PATH_1D), (LN2D, CKPT_PATH_2D))
)


Network initialized
Using default optimizer parameters
Network initialized
Using default optimizer parameters
optimzier parameters: Adadelta (
Parameter Group 0
    differentiable: False
    eps: 1e-06
    foreach: None
    lr: 1.0
    maximize: False
    rho: 0.9
    weight_decay: 0
)


## Freeze the weights
Setting `requires_grad = False` stops PyTorch from tracking the gradients of the CNN layers.


In [4]:
# Disable gradient tracking for every parameter of both pre-trained networks,
# so that an optimizer can only update layers created afterwards.
for pretrained_net in (nnet1d, nnet2d):
    for param in pretrained_net.parameters():
        param.requires_grad = False

## Define Dataset 
A new `Dataset` class is needed. It loads the audio file, selects a random window of $2^{18}$ samples and returns:
1. `audio`: window of raw audio. This will be the input of the 1D-CNN
2. `mel`: mel spectrogram of the extracted window. This will be the input of the 2D-CNN
3. `label`: one-hot encoded label to be predicted by the new classifier

In [5]:
from torch.utils.data import Dataset
import warnings
import librosa
from utils_mgr import getAudio

class DataAudio_double(Dataset):
    """Dataset returning, for each track, a raw-audio clip and its mel spectrogram.

    Each item is a tuple ``(audio, mel, label)``:

    * ``audio``: random window of ``2**18`` samples with a leading channel
      axis, i.e. shape ``(1, 2**18)``;
    * ``mel``: dB-scaled mel spectrogram of that same window (time on the
      first axis after the transpose), optionally passed through ``transform``;
    * ``label``: the entry of ``df['labels']`` for the track that was loaded.

    Args:
        df: table with an ``'index'`` column (track ids understood by
            ``getAudio``) and a ``'labels'`` column.
        transform: optional callable applied to the mel spectrogram only.
        type: stored on the instance; not used by the code in this class.
            (The name shadows the builtin but is kept for caller compatibility.)
    """

    def __init__(self, df, transform=None, type="1D"):
        # Track ids, used to locate the audio files
        self.track_ids = df['index'].values

        # Genre labels, aligned position by position with track_ids
        self.label = df['labels'].values

        # Transformation applied to the mel spectrogram
        self.transform = transform

        # Requested input type (kept for interface compatibility)
        self.type = type

    def __len__(self):
        """Return the number of tracks in the dataset."""
        return len(self.track_ids)

    def create_input(self, i):
        """Load track ``i`` and build the (audio clip, mel spectrogram) pair.

        Raises:
            ValueError: if the track is shorter than ``2**18`` samples.
            Exception: whatever ``getAudio`` / ``librosa`` raise on unreadable
                audio is propagated to the caller.
        """
        clip_len = 2 ** 18

        # Audio decoding and librosa can be noisy; silence their warnings.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            audio, sr = getAudio(self.track_ids[i])

            # Pick a random window. `randint` excludes its upper bound, hence
            # the +1: a track of exactly `clip_len` samples is valid (start=0)
            # and the last possible window can be selected too.
            last_start = audio.shape[0] - clip_len
            if last_start < 0:
                raise ValueError(
                    f"track {self.track_ids[i]} has {audio.shape[0]} samples, "
                    f"fewer than the {clip_len} required"
                )
            start = np.random.randint(0, last_start + 1)
            audio = audio[start:start + clip_len]

            # Magnitude STFT -> power mel spectrogram -> dB scale.
            stft = np.abs(librosa.stft(audio, n_fft=4096, hop_length=2048))

            # Keep the first 128 frames, then transpose so time is the first axis.
            mel = librosa.feature.melspectrogram(sr=sr, S=stft**2, n_mels=513)[:, :128]
            mel = librosa.power_to_db(mel, ref=np.max).T

        return audio[np.newaxis, :], mel

    def __getitem__(self, idx):
        """Return ``(audio, mel, label)`` for item ``idx``.

        If the track cannot be loaded, the next track (wrapping around at the
        end of the dataset) is returned instead, together with ITS label, so
        that input and target always belong to the same track.
        """
        try:
            audio, mel = self.create_input(idx)
            y = self.label[idx]
        except Exception:
            # `Exception` rather than a bare except, so that KeyboardInterrupt
            # and SystemExit are not swallowed while loading data.
            print("\nNot able to load track number ", self.track_ids[idx], " Loading next one\n")
            fallback = (idx + 1) % len(self)
            audio, mel = self.create_input(fallback)
            y = self.label[fallback]

        if self.transform:
            mel = self.transform(mel)

        return audio, mel, y



The following function loads the metadata, wraps the selected tracks in the `DataAudio_double` class (which applies the transformations) and creates the `DataLoader`s.

In [6]:
import os
# to visualize logs

def import_and_preprocess_data(architecture_type="1D", *, val_only=True):
    """Build the DataLoaders from the metadata in ``tracks.csv``.

    The function reads ``data/fma_metadata/tracks.csv``, keeps the ``'small'``
    subset, drops a list of known corrupted tracks, and wraps the requested
    splits in ``DataAudio_double`` datasets.

    Args:
        architecture_type: forwarded to ``DataAudio_double`` as ``type``.
        val_only: when True (default, the behaviour of the original notebook)
            only the validation loader is built and it is returned three
            times. When False, separate training / validation / test loaders
            are built.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``.

    Note:
        The working directory is moved one level up on EVERY call, so calling
        this function twice in the same session changes directory twice.
        The same augmentation pipeline is applied to every split.
    """
    # Relative paths below (and presumably inside getAudio) are resolved from
    # the parent directory.
    os.chdir("../.")

    # Load metadata and features.
    tracks = utils.load('data/fma_metadata/tracks.csv')

    # Select the desired subset among the entire dataset
    sub = 'small'
    raw_subset = tracks[tracks['set', 'subset'] <= sub]

    # Creation of clean subset for the generation of training, test and validation sets
    meta_subset = create_subset(raw_subset)

    # Remove corrupted files
    corrupted = [98565, 98567, 98569, 99134, 108925, 133297]
    meta_subset = meta_subset[~meta_subset['index'].isin(corrupted)]

    # Standard transformations for images.
    # There are two ways to normalize data:
    #   1. Using v2.Normalize(mean=[1.0784853], std=[4.0071154]). These values
    #      are computed with the utils_mgr.mean_computer() function.
    #   2. Using v2.Lambda and MinMaxScaler. This function is implemented in
    #      utils_mgr and resembles the sklearn class of the same name.
    transforms = v2.Compose([v2.ToTensor(),
        v2.RandomResizedCrop(size=(128,513), antialias=True), # Data Augmentation
        v2.RandomHorizontalFlip(p=0.5), # Data Augmentation
        v2.ToDtype(torch.float32, scale=True),
        #v2.Normalize(mean=[1.0784853], std=[4.0071154]),
        v2.Lambda(lambda x: MinMaxScaler(x)) # see utils_mgr
        ])

    # os.cpu_count() may return None; fall back to loading in the main process.
    n_workers = os.cpu_count() or 0

    def _make_loader(split_name, shuffle):
        # Split according to the original FMA split stored in the "split" column.
        split_df = meta_subset[meta_subset["split"] == split_name]
        dataset = DataAudio_double(split_df, transform=transforms, type=architecture_type)
        return DataLoader(dataset, batch_size=64, shuffle=shuffle, num_workers=n_workers)

    val_dataloader = _make_loader("validation", shuffle=False)
    if val_only:
        return val_dataloader, val_dataloader, val_dataloader

    train_dataloader = _make_loader("training", shuffle=True)
    test_dataloader = _make_loader("test", shuffle=False)
    return train_dataloader, val_dataloader, test_dataloader

In [7]:
# Build the DataLoaders. NOTE: called this way, import_and_preprocess_data
# returns the validation loader three times, so the three names below all
# refer to the same loader.
train_dataloader, val_dataloader, test_dataloader = import_and_preprocess_data(architecture_type="2D")

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Checking if the shapes are what is expected

In [8]:
# Fetch one batch only and report the shape of its three components
# (position 0: audio, position 1: mel spectrogram, position 2: label).
for batch in train_dataloader:
    for position, tag in enumerate(("batch[0]", "batch[1]", "batch[2]")):
        print(tag, batch[position].shape)
    break

batch[0] torch.Size([64, 1, 262144])
batch[1] torch.Size([64, 1, 128, 513])
batch[2] torch.Size([64, 8])


## Extracting convolutional layers

In [9]:
import torch.nn as nn

# Walk through every sub-module of nnet2d and keep the Conv2d, BatchNorm2d
# and ReLU ones, in the order in which modules() yields them.
conv_layers = [
    layer
    for layer in nnet2d.modules()
    if isinstance(layer, (nn.Conv2d, nn.BatchNorm2d, nn.ReLU))
]

# Only the first nine entries form the convolutional part; the ReLU modules
# that follow belong to the fully-connected layers and are discarded.
conv_block2D = nn.Sequential(*conv_layers[:9])

# Show the resulting convolutional block
print(conv_block2D)


Sequential(
  (0): Conv2d(1, 256, kernel_size=(4, 513), stride=(1, 1))
  (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
  (3): Conv2d(256, 256, kernel_size=(4, 1), stride=(1, 1), padding=(2, 0))
  (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): ReLU()
  (6): Conv2d(256, 256, kernel_size=(4, 1), stride=(1, 1), padding=(1, 0))
  (7): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): ReLU()
)


In [10]:
import torch.nn as nn

# Walk through every sub-module of nnet1d and keep the Conv1d, BatchNorm1d
# and ReLU ones, in the order in which modules() yields them.
conv_layers = [
    layer
    for layer in nnet1d.modules()
    if isinstance(layer, (nn.Conv1d, nn.BatchNorm1d, nn.ReLU))
]

# The first twelve entries are chained into the 1D convolutional block.
conv_block1D = nn.Sequential(*conv_layers[:12])

# Show the resulting convolutional block
print(conv_block1D)


Sequential(
  (0): Conv1d(1, 16, kernel_size=(128,), stride=(32,), padding=(64,))
  (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): Conv1d(16, 32, kernel_size=(32,), stride=(2,), padding=(16,))
  (4): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): ReLU(inplace=True)
  (6): Conv1d(32, 64, kernel_size=(16,), stride=(2,), padding=(8,))
  (7): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): ReLU(inplace=True)
  (9): Conv1d(64, 128, kernel_size=(8,), stride=(2,), padding=(4,))
  (10): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (11): ReLU(inplace=True)
)


### Check that the weights are the same

In [11]:
# For nnet2d first and conv_block2D second, print the first filter of the
# first Conv2d module found, so the two printouts can be compared by eye.
for container in (nnet2d, conv_block2D):
    first_conv = next(
        (module for module in container.modules() if isinstance(module, torch.nn.Conv2d)),
        None,
    )
    if first_conv is not None:
        print(first_conv.weight[0])

tensor([[[-0.0044, -0.0048, -0.0087,  ..., -0.0131, -0.0143, -0.0064],
         [-0.0104, -0.0012, -0.0093,  ..., -0.0091, -0.0018, -0.0065],
         [-0.0071, -0.0116, -0.0176,  ...,  0.0010, -0.0042,  0.0029],
         [-0.0152, -0.0159, -0.0148,  ..., -0.0076, -0.0033, -0.0023]]])
tensor([[[-0.0044, -0.0048, -0.0087,  ..., -0.0131, -0.0143, -0.0064],
         [-0.0104, -0.0012, -0.0093,  ..., -0.0091, -0.0018, -0.0065],
         [-0.0071, -0.0116, -0.0176,  ...,  0.0010, -0.0042,  0.0029],
         [-0.0152, -0.0159, -0.0148,  ..., -0.0076, -0.0033, -0.0023]]])


## Define the new model `MixNet`
**For the actual final architecture, see the .py files.** The final version may have far fewer layers.

In [12]:
class MixNet(nn.Module):
    """Classifier combining features of a 1D and a 2D convolutional block.

    The two blocks are used as feature extractors; their pooled outputs are
    concatenated and fed to a fully-connected classifier that ends with a
    softmax over 8 classes.

    Args:
        conv_block1D: module applied to the raw-audio input.
        conv_block2D: module applied to the mel-spectrogram input.

    Note:
        The first ``nn.Linear`` expects 2048 features from the 1D branch and
        512 from the 2D branch. The pooling sizes are hard-coded for the
        input shapes used in this notebook (audio ``(B, 1, 2**18)``, mel
        ``(B, 1, 128, 513)``); other shapes will not match the classifier.

        NOTE(review): the model outputs probabilities (softmax). If it is
        trained with a loss that applies softmax itself (e.g.
        ``nn.CrossEntropyLoss``), the softmax would be applied twice -
        confirm against the training code.
    """

    def __init__(self, conv_block1D, conv_block2D):
        super().__init__()
        self.conv_block1D = conv_block1D
        self.conv_block2D = conv_block2D

        # A single Dropout module is reused at every position; Dropout has no
        # parameters, so sharing the instance is harmless.
        self.dropout = nn.Dropout(p=0.5)

        self.classifier = nn.Sequential(
            nn.Linear(512+2048, 512),
            nn.ReLU(),
            self.dropout,
            nn.Linear(512, 256),
            nn.ReLU(),
            self.dropout,
            nn.Linear(256, 128),
            nn.ReLU(),
            self.dropout,
            nn.Linear(128, 8),
            nn.Softmax(dim=1)
        )

    def train(self, mode=True):
        """Set the training mode, keeping fully frozen conv blocks in eval mode.

        ``nn.Module.train`` switches every sub-module to training mode. For a
        frozen block this would let its BatchNorm layers use batch statistics
        and update their running averages, even though their parameters have
        ``requires_grad=False``. A block with no trainable parameter is
        therefore put back in eval mode; a block that still has trainable
        parameters follows ``mode`` as usual.
        """
        super().train(mode)
        for backbone in (self.conv_block1D, self.conv_block2D):
            if not any(p.requires_grad for p in backbone.parameters()):
                backbone.eval()
        return self

    def forward(self, x):
        """Compute class probabilities.

        Args:
            x: sequence whose first element is the audio batch and whose
                second element is the mel-spectrogram batch.

        Returns:
            Tensor of shape ``(batch_size, 8)`` whose rows sum to 1.
        """
        audio = x[0]
        mel   = x[1]

        # 2D branch: max- and average-pool over a 125-row window, then stack
        # the two results along the channel axis.
        conv2d = self.conv_block2D(mel)
        max_pool = F.max_pool2d(conv2d, kernel_size=(125,1))
        avg_pool = F.avg_pool2d(conv2d, kernel_size=(125,1))
        cat2d = torch.cat([max_pool,avg_pool],dim=1)
        cat2d = cat2d.view(cat2d.size(0), -1) # cat2d shape: torch.Size([batch_size, 512])

        # 1D branch: same scheme with windows of 125 time steps.
        conv1d = self.conv_block1D(audio)
        max_pool = F.max_pool1d(conv1d, kernel_size=125)
        avg_pool = F.avg_pool1d(conv1d, kernel_size=125)
        cat1d = torch.cat([max_pool,avg_pool],dim=1)
        cat1d = cat1d.view(cat1d.size(0), -1) # cat1d shape: torch.Size([batch_size, 2048])

        # Concatenate the two feature vectors (2048 + 512 values per sample),
        # apply dropout and pass the result to the classifier.
        x = torch.cat([cat1d, cat2d], dim=1)
        x = self.dropout(x)
        x = self.classifier(x)
        return x

In [13]:
# Assemble the combined model from the two extracted convolutional blocks.
mn = MixNet(conv_block1D, conv_block2D)

In [14]:
# Display the structure of the assembled model (notebook cell output).
mn

MixNet(
  (conv_block1D): Sequential(
    (0): Conv1d(1, 16, kernel_size=(128,), stride=(32,), padding=(64,))
    (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv1d(16, 32, kernel_size=(32,), stride=(2,), padding=(16,))
    (4): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv1d(32, 64, kernel_size=(16,), stride=(2,), padding=(8,))
    (7): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU(inplace=True)
    (9): Conv1d(64, 128, kernel_size=(8,), stride=(2,), padding=(4,))
    (10): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): ReLU(inplace=True)
  )
  (conv_block2D): Sequential(
    (0): Conv2d(1, 256, kernel_size=(4, 513), stride=(1, 1))
    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3)

Check if everything is alright

In [15]:
# Run one batch through the model as a sanity check, then stop.
# NOTE: the name `input` shadows the builtin; it is kept so that later cells
# relying on it keep working.
for batch in train_dataloader:
    input, label = batch[:2], batch[2]  # (audio, mel) pair and the labels

    print(input[0].shape)  # Audio shape: torch.Size([64, 1, 262144])
    print(label.shape)  # Label shape: torch.Size([64, 8])

    output = mn(input)
    print(output)

    break

torch.Size([64, 1, 262144])
torch.Size([64, 8])
tensor([[0.1200, 0.1088, 0.1244, 0.1430, 0.1169, 0.1292, 0.1228, 0.1349],
        [0.1210, 0.1244, 0.1332, 0.1168, 0.1430, 0.1172, 0.1325, 0.1119],
        [0.1187, 0.1250, 0.1277, 0.1204, 0.1334, 0.1245, 0.1264, 0.1239],
        [0.1228, 0.1207, 0.1244, 0.1215, 0.1283, 0.1322, 0.1245, 0.1254],
        [0.1279, 0.1271, 0.1230, 0.1276, 0.1220, 0.1245, 0.1353, 0.1126],
        [0.1090, 0.1376, 0.1285, 0.1236, 0.1366, 0.1204, 0.1224, 0.1221],
        [0.1315, 0.1065, 0.1329, 0.1279, 0.1306, 0.1244, 0.1341, 0.1121],
        [0.1137, 0.1078, 0.1246, 0.1274, 0.1394, 0.1318, 0.1350, 0.1202],
        [0.1119, 0.1250, 0.1199, 0.1252, 0.1361, 0.1226, 0.1271, 0.1321],
        [0.1230, 0.1397, 0.1282, 0.1228, 0.1261, 0.1223, 0.1165, 0.1214],
        [0.1121, 0.1386, 0.1148, 0.1277, 0.1317, 0.1302, 0.1227, 0.1223],
        [0.0990, 0.1255, 0.1268, 0.1365, 0.1427, 0.1420, 0.1138, 0.1137],
        [0.1188, 0.1246, 0.1287, 0.1283, 0.1333, 0.1191, 0.1322,

## Only classifier layers need grad

In [16]:
# List every parameter of the model with its requires_grad flag: only the
# classifier entries are expected to show True.
for param_name, tensor in mn.named_parameters():
    print(f'{param_name}: {tensor.requires_grad}')


conv_block1D.0.weight: False
conv_block1D.0.bias: False
conv_block1D.1.weight: False
conv_block1D.1.bias: False
conv_block1D.3.weight: False
conv_block1D.3.bias: False
conv_block1D.4.weight: False
conv_block1D.4.bias: False
conv_block1D.6.weight: False
conv_block1D.6.bias: False
conv_block1D.7.weight: False
conv_block1D.7.bias: False
conv_block1D.9.weight: False
conv_block1D.9.bias: False
conv_block1D.10.weight: False
conv_block1D.10.bias: False
conv_block2D.0.weight: False
conv_block2D.0.bias: False
conv_block2D.1.weight: False
conv_block2D.1.bias: False
conv_block2D.3.weight: False
conv_block2D.3.bias: False
conv_block2D.4.weight: False
conv_block2D.4.bias: False
conv_block2D.6.weight: False
conv_block2D.6.bias: False
conv_block2D.7.weight: False
conv_block2D.7.bias: False
classifier.0.weight: True
classifier.0.bias: True
classifier.3.weight: True
classifier.3.bias: True
classifier.6.weight: True
classifier.6.bias: True
classifier.9.weight: True
classifier.9.bias: True
