# <center>Convolutional layer size calculations</center>

A set of utilities for computing the "sizes" of a convolutional layer: its output shape and its number of trainable parameters.

In [1]:
import torch
from torch.utils.data import Dataset

import torchaudio
import torchaudio.transforms

import torchvision


import sys, os

from pprint import pprint

from tqdm.autonotebook import tqdm

import json

import numpy as np

import matplotlib.pylab as plt
import seaborn as sns

import librosa
import librosa.display

import pandas as pd

from pathlib import Path

import gc

MANUAL_SEED = 69

import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

from datetime import date
from datetime import datetime

import os.path
from os import path
  
import json

import time

import copy

from matplotlib import pyplot as plt
plt.rcParams['figure.dpi'] = 200
plt.rcParams['savefig.dpi'] = 200

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import KFold

import random

from pprint import pformat

import math
import pathlib

In [2]:
import math

In [3]:
def calculate_output_length(length_in, kernel_size, stride=1, padding=0, dilation=1):
  """Return the output length of a convolution along a single dimension.

  Computes ``(length_in + 2*padding - dilation*(kernel_size - 1) - 1) // stride + 1``.

  Args:
    length_in: Input length along the convolved dimension.
    kernel_size: Kernel size along that dimension.
    stride: Step between kernel positions (default 1).
    padding: Padding added to *each* side of the input (default 0).
    dilation: Spacing between kernel taps (default 1).

  Returns:
    The number of output positions; an ``int`` when all arguments are ints.
  """
  # Length available once both sides have been padded.
  padded_length = length_in + 2 * padding
  # Number of input positions one (possibly dilated) kernel covers.
  kernel_span = dilation * (kernel_size - 1) + 1
  return (padded_length - kernel_span) // stride + 1

In [4]:
def calculate_pool_length(w, f, s):
  """Return the output length of a pooling window slid over one dimension.

  Computes ``floor((w - f) / s + 1)``; no padding or dilation is modelled.

  Args:
    w: Input length along the pooled dimension.
    f: Pooling window size.
    s: Pooling stride.

  Returns:
    The pooled length as an ``int``.
  """
  # True division first; the floor is applied after adding the initial window.
  strides_that_fit = (w - f) / s
  return math.floor(strides_that_fit + 1)

In [5]:
def calculate_num_params_conv_layer(
  channels_in, kernel_width, kernel_height, channels_out
):
  """Return the number of trainable parameters of a convolutional layer.

  Each of the ``channels_out`` filters holds
  ``channels_in * kernel_width * kernel_height`` weights plus one bias term.

  Args:
    channels_in: Number of input channels.
    kernel_width: Kernel width.
    kernel_height: Kernel height (pass 1 for a 1-D convolution).
    channels_out: Number of output channels (filters).

  Returns:
    Total count of weights and biases.
  """
  weights_per_filter = channels_in * kernel_width * kernel_height
  num_weights = weights_per_filter * channels_out
  num_biases = channels_out
  return num_weights + num_biases

In [6]:
# Per-layer configuration for the size walk-through below (one entry per layer).
# The lists must be defined in this cell: with every variant commented out the
# cell only runs if the names are left over in the kernel from another cell and
# fails with a NameError after "Restart & Run All".

# Alternative configuration with twice as many filters in the first four layers:
# kernel_sizes = [64, 32, 16,   8,   4]
# pool_sizes   = [ 8,  8,  2,   2,   4]
# strides      = [ 3,  3,  2,   2,   2]
# pool_strides = [ 8,  8,  2,   4,   2]
# num_filters  = [16, 32, 64, 128,   6]

# Active configuration.
kernel_sizes = [64, 32, 16,   8,   4]
pool_sizes   = [ 8,  8,  2,   2,   4]
strides      = [ 3,  3,  2,   2,   2]
pool_strides = [ 8,  8,  2,   4,   2]
num_filters  = [ 8, 16, 32,  64,   6]

NUM_LAYERS = len(kernel_sizes)


INITIAL_AUDIO_NUM_FRAMES = 238000 # --> 8 kHz
compr_out_size = INITIAL_AUDIO_NUM_FRAMES

# The first layer sees a single input channel; afterwards each layer's input
# channel count is the previous layer's filter count.
channels_in = 1

tot_num_params = 0

for compr_id, kernel_size, stride, pool_size, pool_stride, channels_out in zip(
  range(NUM_LAYERS), 
  kernel_sizes, strides, 
  pool_sizes, pool_strides,
  num_filters
):
  
  # Length after the convolution (default padding=0, dilation=1) ...
  compr_out_size = calculate_output_length(
    compr_out_size, kernel_size=kernel_size, stride=stride
  )
  
  # ... and after the pooling step that follows it.
  compr_out_size = calculate_pool_length(compr_out_size, pool_size, pool_stride)

  # kernel_height=1: the layer is counted as a 1-D convolution.
  num_params = calculate_num_params_conv_layer(
    channels_in=channels_in, kernel_width=kernel_size, kernel_height=1, 
    channels_out=channels_out
  )

  channels_in = channels_out
  
  print(f"compr_out_{compr_id}_out: {compr_out_size}, num_params: {num_params}")

  tot_num_params += num_params

print(f"\n\ntotal number of params: {tot_num_params}")


compr_out_0_out: 9914, num_params: 520
compr_out_1_out: 411, num_params: 4112
compr_out_2_out: 99, num_params: 8224
compr_out_3_out: 12, num_params: 16448
compr_out_4_out: 1, num_params: 1542


total number of params: 30846


In [7]:
class CNN(nn.Module):
  """Stack of ``num_layers`` conv -> max-pool -> batch-norm -> ReLU stages.

  Every stage ``i`` is registered in ``self.convs`` under the names
  ``conv_{i}``, ``pool_{i}``, ``batchnorm_{i}`` and ``activ_{i}``.

  Args:
    num_layers: Number of stages to build.
    kernel_sizes: Convolution kernel size for each stage.
    strides: Convolution stride for each stage.
    in_channels: Number of channels of the network input.
    num_filters: Number of convolution output channels for each stage.
    pool_sizes: Max-pool kernel size for each stage.
    pool_strides: Max-pool stride for each stage.
    dropout_p_conv: Stored and reported by ``get_model_setup``; no dropout
      layer is created from it in this class.
    dropout_p_linear: Stored and reported by ``get_model_setup``; no dropout
      layer is created from it in this class.
  """

  def __init__(
    self, 
    num_layers, 
    kernel_sizes, strides, 
    in_channels, num_filters,
    pool_sizes, pool_strides,
    dropout_p_conv, dropout_p_linear
  ):
    super().__init__()

    self.num_layers = num_layers 
    self.kernel_sizes = kernel_sizes 
    self.strides = strides 
    self.in_channels = in_channels 
    self.num_filters = num_filters
    self.pool_sizes = pool_sizes 
    self.pool_strides = pool_strides
    
    self.dropout_p_conv = dropout_p_conv
    self.dropout_p_linear = dropout_p_linear

    self.convs = nn.Sequential()

    # Input channel count of the stage being built; self.in_channels keeps the
    # value for the network input.
    layer_in_channels = in_channels

    for i in range(num_layers):
      layer_out_channels = self.num_filters[i]

      conv_layer = nn.Conv2d(
        kernel_size=self.kernel_sizes[i],
        stride=self.strides[i],
        in_channels=layer_in_channels,
        out_channels=layer_out_channels
      )
      torch.nn.init.xavier_uniform_(conv_layer.weight)

      pooling_layer = nn.MaxPool2d(
        kernel_size=self.pool_sizes[i],
        stride=self.pool_strides[i],
      )

      # One fresh BatchNorm2d per stage. Looking the module up in a shared
      # table keyed by channel count would hand the *same* instance (same
      # affine parameters and running statistics) to every stage with that
      # channel count, and would fail for channel counts missing from the table.
      batchnorm_layer = nn.BatchNorm2d(num_features=layer_out_channels)

      self.convs.add_module(name=f"conv_{i}", module=conv_layer)
      self.convs.add_module(name=f"pool_{i}", module=pooling_layer)
      self.convs.add_module(name=f"batchnorm_{i}", module=batchnorm_layer)
      self.convs.add_module(name=f"activ_{i}", module=nn.ReLU())

      layer_in_channels = layer_out_channels

  def forward(self, x):
    """Run ``x`` through all stages in order and return the result."""
    x = self.convs(x)

    return x

  def get_model_setup(self):
    """Return the constructor arguments as a dict keyed by argument name."""
    return {
      "num_layers": self.num_layers, 
      "kernel_sizes": self.kernel_sizes, 
      "strides": self.strides, 
      "in_channels": self.in_channels, 
      "num_filters": self.num_filters,
      "pool_sizes": self.pool_sizes, 
      "pool_strides": self.pool_strides,
      "dropout_p_conv": self.dropout_p_conv,
      "dropout_p_linear": self.dropout_p_linear,
    }

In [8]:
# Hyperparameters of the small test CNN, one list entry per layer.
# A fourth layer (kernel 2, pool 2, stride 1, pool stride 1, 8 filters) is
# currently disabled.
kernel_sizes = [2, 2, 2]
strides = [2, 2, 1]
pool_sizes = [2, 2, 2]
pool_strides = [2, 2, 1]
num_filters = [1, 2, 4]

num_layers = len(kernel_sizes)

# we always use mono audio in both cases!
in_channels = 1

In [9]:
# Build the test network from the per-layer hyperparameter lists defined above.
cnn = CNN(
  num_layers=num_layers,
  in_channels=in_channels,
  num_filters=num_filters,
  kernel_sizes=kernel_sizes,
  strides=strides,
  pool_sizes=pool_sizes,
  pool_strides=pool_strides,
  dropout_p_conv=0.2,
  dropout_p_linear=0.5,
)

In [10]:
# Random batch of 16 single-channel inputs of size 128 x 1860, used only to
# inspect the shape the network produces.
# test_cnn_input = torch.rand((16, 1, 128, 5157))
test_cnn_input = torch.rand(16, 1, 128, 1860)

test_cnn_out = cnn(test_cnn_input)

test_cnn_out.shape

torch.Size([16, 4, 6, 114])