In [1]:
import torch
import torchaudio
from nnAudio import features
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import lightning.pytorch as pl
from lightning.pytorch.loggers import TensorBoardLogger
import os
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Audio
from IPython.display import clear_output, display
import librosa
import math
import datetime
import time
import numpy as np
import random

#Eigene Sachen
from train import *
from data.data import *
from util.util import *
from util.helper import *
from util.datasets import *
from util.audio_processing import *
import util.plot as plot
import util.metric as metric
import models
import tqdm

c:\Program Files\Python39\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Program Files\Python39\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [2]:
from models.Simple import *
from models.Simple.SimpleLightningBase import *
from models.Simple.DeepCNN import *
from torchaudio.transforms import MelScale

In [10]:
class CNN(SimpleLightningBase):
    def __init__(self, channels = 32) -> None:
        
        super().__init__()

            
        self.conv1 = nn.Conv1d(in_channels = 1, out_channels = channels, kernel_size = 160, stride = 4)
        self.bn1   = nn.BatchNorm1d(channels)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(in_channels = channels, out_channels = channels, kernel_size = 4,  stride = 1)
        self.bn2   = nn.BatchNorm1d(channels)
        self.pool2 = nn.MaxPool1d(2)
        self.conv3 = nn.Conv1d(in_channels = channels, out_channels = channels, kernel_size = 2,  stride = 1)
        self.bn3   = nn.BatchNorm1d(channels)
        self.pool3 = nn.MaxPool1d(2)
        self.fc1   = nn.Linear(channels,1)
    
    def forward(self, x):
        
        #reshape
        out = x.unsqueeze(-2)

        #forward
        out = self.pool1(torch.relu(self.bn1(self.conv1(out))))
        out = self.pool2(torch.relu(self.bn2(self.conv2(out))))
        out = self.pool3(torch.relu(self.bn3(self.conv3(out))))

        out = torch.avg_pool1d(out,out.size(-1).item()).squeeze()

        out = self.fc1(out)
        #out = torch.sigmoid(out)
        return out.squeeze()

In [9]:
model_32_channels = CNN(channels=32)
model_64_channels = CNN(channels=64)

pd.DataFrame({
    "32 Channels" : {
        "256 hop_length"     : get_model_flops(model_32_channels, duration=1, sample_length = 512, hop_length = 256, context_length = 0).total()     / 10e+6,
        "512 hop_length"     : get_model_flops(model_32_channels, duration=1, sample_length = 512, hop_length = 512, context_length = 0).total()     / 10e+6,
        "512 context_length" : get_model_flops(model_32_channels, duration=1, sample_length = 512, hop_length = 512, context_length = 512).total()   / 10e+6,
        "1536 context_length": get_model_flops(model_32_channels, duration=1, sample_length = 512, hop_length = 512, context_length = 1536).total()  / 10e+6,
        "3584 context_length": get_model_flops(model_32_channels, duration=1, sample_length = 512, hop_length = 512, context_length = 3584).total()  / 10e+6,
    },
    
    "64 Channels" : {
        "256 hop_length"     : get_model_flops(model_64_channels, duration=1, sample_length = 512, hop_length = 256, context_length = 0).total()      / 10e+6,
        "512 hop_length"     : get_model_flops(model_64_channels, duration=1, sample_length = 512, hop_length = 512, context_length = 0).total()      / 10e+6,
        "512 context_length" : get_model_flops(model_64_channels, duration=1, sample_length = 512, hop_length = 512, context_length = 512).total()    / 10e+6,
        "1536 context_length": get_model_flops(model_64_channels, duration=1, sample_length = 512, hop_length = 512, context_length = 1536).total()   / 10e+6,
        "3584 context_length": get_model_flops(model_64_channels, duration=1, sample_length = 512, hop_length = 512, context_length = 3584).total()   / 10e+6,
    }
})

Unsupported operator aten::add_ encountered 3 time(s)
Unsupported operator aten::max_pool1d encountered 3 time(s)
Unsupported operator aten::avg_pool1d encountered 1 time(s)
The following submodules of the model were never called during the trace of the graph. They may be unused, or they were accessed by direct calls to .forward() or via other python methods. In the latter case they will have zeros for statistics, though their statistics will still contribute to their parent calling module.
accuracy
Unsupported operator aten::add_ encountered 3 time(s)
Unsupported operator aten::max_pool1d encountered 3 time(s)
Unsupported operator aten::avg_pool1d encountered 1 time(s)
The following submodules of the model were never called during the trace of the graph. They may be unused, or they were accessed by direct calls to .forward() or via other python methods. In the latter case they will have zeros for statistics, though their statistics will still contribute to their parent calling module.

Unnamed: 0,32 Channels,64 Channels
256 hop_length,3.524576,8.217331
512 hop_length,1.762288,4.108666
512 context_length,4.389104,10.378106
1536 context_length,9.642736,22.916986
3584 context_length,20.15,47.994746


In [5]:
class FFTCNN(SimpleLightningBase):
    def __init__(self, fft_window = torch.hann_window(512), first_kernel_size = 16, kernel_size = 16, mid_channels=32, last_channels=32, n_blocks = 1, dense_features = 32) -> None:

        #Super
        super().__init__()

        #FFT
        self.fft = FFT( window = fft_window, low_treshold = -100)

        #First Layer
        self.first_cnn_layer = Block( in_channels = 1, out_channels = mid_channels, kernel_size = first_kernel_size, stride = 1, bn = False)

        #Blocks
        self.block_list = nn.ModuleList()
        for i in range(n_blocks):
            self.block_list.append( Block (
                in_channels  = mid_channels,
                out_channels = mid_channels,
                kernel_size  = kernel_size
            ))
        
        #Last CNN Layer
        self.last_cnn_layer = self.last_cnn_layer = Block(in_channels = mid_channels, out_channels = last_channels, kernel_size = kernel_size, stride = 1)

        #Dense
        self.fc1 = nn.Linear(last_channels, dense_features)
        self.bn1 = nn.BatchNorm1d(last_channels)
        self.fc2 = nn.Linear(dense_features, 1)
    
    def forward(self, x):

        #Out
        out = x

        #FFT
        out = self.fft(out)

        #Reshape
        out = out.unsqueeze(1)

        #First Layer
        out = self.first_cnn_layer(out)
        
        #Blocks
        for block in self.block_list:
            out = block(out)
        
        #Last CNN Layer
        out = self.last_cnn_layer(out)

        #Flatten
        out = torch.avg_pool1d(out, out.size(-1).item())
        out = out.squeeze()

        #Dense
        out = self.fc1(out)
        out = torch.relu(out)
        out = self.bn1(out)
        
        out = self.fc2(out)
        #out = torch.sigmoid(out)
        out = out.squeeze()
        
        return out


In [6]:
class MELCNN(SimpleLightningBase):
    def __init__(self, fft_window = torch.hann_window(512), n_mels = 64, first_kernel_size = 16, kernel_size = 16, mid_channels=32, last_channels=32, n_blocks = 1, dense_features = 32, sr = None) -> None:

        #Super
        super().__init__()

        #FFT
        self.fft = FFT( window = fft_window, low_treshold = -100)

        #MEL
        self.mel_transformer = MelScale(n_mels = n_mels, n_stft = fft_window.size(0) // 2 + 1, sample_rate = sr)

        #First Layer
        self.first_cnn_layer = Block( in_channels = 1, out_channels = mid_channels, kernel_size = first_kernel_size, stride = 1, bn = False)

        #Blocks
        self.block_list = nn.ModuleList()
        for i in range(n_blocks):
            self.block_list.append( Block (
                in_channels  = mid_channels,
                out_channels = mid_channels,
                kernel_size  = kernel_size
            ))
        
        #Last CNN Layer
        self.last_cnn_layer = self.last_cnn_layer = Block(in_channels = mid_channels, out_channels = last_channels, kernel_size = kernel_size, stride = 1)

        #Dense
        self.fc1 = nn.Linear(last_channels, dense_features)
        self.bn1 = nn.BatchNorm1d(last_channels)
        self.fc2 = nn.Linear(dense_features, 1)
    
    def forward(self, x):

        #Out
        out = x

        #FFT
        out = self.fft(out)

        #MEL
        out = self.mel_transformer( out.unsqueeze(-1) ).squeeze()

        #Reshape
        out = out.unsqueeze(1)

        #First Layer
        out = self.first_cnn_layer(out)
        
        #Blocks
        for block in self.block_list:
            out = block(out)
        
        #Last CNN Layer
        out = self.last_cnn_layer(out)

        #Flatten
        out = torch.avg_pool1d(out, out.size(-1).item())
        out = out.squeeze()

        #Dense
        out = self.fc1(out)
        out = torch.relu(out)
        out = self.bn1(out)
        
        out = self.fc2(out)
        #out = torch.sigmoid(out)
        out = out.squeeze()
        
        return out

In [13]:
fft_model_512  = FFTCNN(fft_window = torch.hann_window(512))
fft_model_1024 = FFTCNN(fft_window = torch.hann_window(1024))
fft_model_2048 = FFTCNN(fft_window = torch.hann_window(2048))
fft_model_4096 = FFTCNN(fft_window = torch.hann_window(4096))

mel_64_model_512  = MELCNN(fft_window = torch.hann_window(512) , n_mels = 64, sr= SAMPLE_RATE)
mel_64_model_1024 = MELCNN(fft_window = torch.hann_window(1024), n_mels = 64, sr= SAMPLE_RATE)
mel_64_model_2048 = MELCNN(fft_window = torch.hann_window(2048), n_mels = 64, sr= SAMPLE_RATE)
mel_64_model_4096 = MELCNN(fft_window = torch.hann_window(4096), n_mels = 64, sr= SAMPLE_RATE)

mel_32_model_512  = MELCNN(fft_window = torch.hann_window(512) , n_mels = 32, sr= SAMPLE_RATE)
mel_32_model_1024 = MELCNN(fft_window = torch.hann_window(1024), n_mels = 32, sr= SAMPLE_RATE)
mel_32_model_2048 = MELCNN(fft_window = torch.hann_window(2048), n_mels = 32, sr= SAMPLE_RATE)
mel_32_model_4096 = MELCNN(fft_window = torch.hann_window(4096), n_mels = 32, sr= SAMPLE_RATE)

pd.DataFrame({
    "FFTCNN" : {
        "256 hop_length"     : get_model_flops(fft_model_512,  duration=1, sample_length = 512, hop_length = 256, context_length = 0).total()       / 10e+6,
        "512 hop_length"     : get_model_flops(fft_model_512,  duration=1, sample_length = 512, hop_length = 512, context_length = 0).total()       / 10e+6,
        "512 context_length" : get_model_flops(fft_model_1024, duration=1, sample_length = 512, hop_length = 512, context_length = 512).total()     / 10e+6,
        "1536 context_length": get_model_flops(fft_model_2048, duration=1, sample_length = 512, hop_length = 512, context_length = 1536).total()    / 10e+6,
        "3584 context_length": get_model_flops(fft_model_2048, duration=1, sample_length = 512, hop_length = 512, context_length = 1536).total()    / 10e+6,
    },

    "MELCNN" : {
        "256 hop_length"     : get_model_flops(mel_64_model_512,  duration=1, sample_length = 512, hop_length = 256, context_length = 0).total()    / 10e+6,
        "512 hop_length"     : get_model_flops(mel_64_model_512,  duration=1, sample_length = 512, hop_length = 512, context_length = 0).total()    / 10e+6,
        "512 context_length" : get_model_flops(mel_64_model_1024, duration=1, sample_length = 512, hop_length = 512, context_length = 512).total()  / 10e+6,
        "1536 context_length": get_model_flops(mel_64_model_2048, duration=1, sample_length = 512, hop_length = 512, context_length = 1536).total() / 10e+6,
        "3584 context_length": get_model_flops(mel_64_model_4096, duration=1, sample_length = 512, hop_length = 512, context_length = 3584).total() / 10e+6,
    },

    "MELCNN_32" : {
        "256 hop_length"     : get_model_flops(mel_32_model_512,  duration=1, sample_length = 512, hop_length = 256, context_length = 0).total()    / 10e+6,
        "512 hop_length"     : get_model_flops(mel_32_model_512,  duration=1, sample_length = 512, hop_length = 512, context_length = 0).total()    / 10e+6,
        "512 context_length" : get_model_flops(mel_32_model_1024, duration=1, sample_length = 512, hop_length = 512, context_length = 512).total()  / 10e+6,
        "1536 context_length": get_model_flops(mel_32_model_2048, duration=1, sample_length = 512, hop_length = 512, context_length = 1536).total() / 10e+6,
        "3584 context_length": get_model_flops(mel_32_model_4096, duration=1, sample_length = 512, hop_length = 512, context_length = 3584).total() / 10e+6,
    },
})

Unsupported operator aten::mul encountered 6 time(s)
Unsupported operator aten::fft_rfft encountered 1 time(s)
Unsupported operator aten::abs encountered 3 time(s)
Unsupported operator aten::sum encountered 1 time(s)
Unsupported operator aten::div encountered 2 time(s)
Unsupported operator aten::log10 encountered 1 time(s)
Unsupported operator aten::lt encountered 1 time(s)
Unsupported operator aten::add encountered 7 time(s)
Unsupported operator aten::sub encountered 12 time(s)
Unsupported operator aten::pad encountered 3 time(s)
Unsupported operator aten::add_ encountered 3 time(s)
Unsupported operator aten::avg_pool1d encountered 1 time(s)
The following submodules of the model were never called during the trace of the graph. They may be unused, or they were accessed by direct calls to .forward() or via other python methods. In the latter case they will have zeros for statistics, though their statistics will still contribute to their parent calling module.
accuracy
Unsupported operat

Unnamed: 0,FFTCNN,MELCNN,MELCNN_32
256 hop_length,53.545779,13.441997,6.724768
512 hop_length,26.77289,6.720998,3.362384
512 context_length,53.43785,6.771789,3.387779
1536 context_length,106.76777,6.87337,3.43857
3584 context_length,106.76777,7.076531,3.54015
