In [55]:
# !apt-get --purge remove cuda nvidia* libnvidia-*
# !dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
# !apt-get remove cuda-*
# !apt autoremove
# !apt-get update

In [56]:
# !wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# !dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# !apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
# !apt-get update
# !apt-get install cuda-9.2

In [57]:
# !nvcc --version 

In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git 

In [None]:
%load_ext nvcc_plugin
from google.colab import drive
drive.mount('/content/drive/')

In [60]:
!cp -r /content/drive/MyDrive/Colab_Notebooks/MBCONVS_float/ /content/MBCONVS_float

In [65]:
%%cuda --name KERNELS.cu 

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cusolverDn.h>
#include <cuda_runtime.h>

#include "/content/MBCONVS_float/functionsV2.h"
#include "/content/MBCONVS_float/KERNELSH.h"

/* Kernel definitions */
/*
 * INPUT_UNROLLING
 *
 * Copies, for every output position, the Filter_Height x Filter_Height input
 * window that starts at (out_row * stride, out_col * stride) into X_unrolled.
 *
 * Layout written to X_unrolled (row width = W2):
 *   row    = depth * Filter_Height^2 + local_row * Filter_Height + local_col
 *   column = out_row * Output_Width + out_col
 *
 * Launch layout: x/y threads cover the output columns/rows, blockIdx.z selects
 * the input channel. H2 and D2 are accepted for signature symmetry and are not
 * read here.
 *
 * NOTE(review): Input is read at (row*stride + local_row, col*stride + local_col)
 * without checking against H1/W1 - presumably the caller passes an already
 * padded input that makes every window valid; confirm at the call site.
 */
__global__ void INPUT_UNROLLING(int stride, int Filter_Height,
                                float *Input, int H1, int W1, int D1,
                                float *X_unrolled, int H2, int W2, int D2,
                                int Output_Height, int Output_Width)
{
    // Output-space coordinates handled by this thread.
    const int out_col = blockIdx.x * blockDim.x + threadIdx.x;
    const int out_row = blockIdx.y * blockDim.y + threadIdx.y;
    const int channel = blockIdx.z;

    // Threads outside the output plane (or beyond the last channel) do nothing.
    if (out_row >= Output_Height || out_col >= Output_Width || channel >= D1)
    {
        return;
    }

    // Top-left corner of this thread's window in the input plane.
    const int in_row0 = out_row * stride;
    const int in_col0 = out_col * stride;

    // Offsets that do not depend on the position inside the window.
    const int in_base  = channel * H1 * W1;
    const int out_base = out_col + out_row * Output_Width
                       + channel * W2 * Filter_Height * Filter_Height;

    // Each thread copies its k x k window, one element per unrolled row.
    for (int wr = 0; wr < Filter_Height; ++wr)
    {
        const int in_row_off  = (in_row0 + wr) * W1;
        const int out_row_off = wr * Filter_Height * W2;

        for (int wc = 0; wc < Filter_Height; ++wc)
        {
            X_unrolled[wc * W2 + out_row_off + out_base] =
                Input[in_row_off + (in_col0 + wc) + in_base];
        }
    }
}

/*
 * DWConv2d_kernel
 *
 * Per-channel 2-D convolution: output element (dep, row, col) is the sum over
 * the H2 x W2 filter of Filter[dep][j][i] * Input[dep][row*stride + j][col*stride + i].
 *
 * Input  : D1 planes of H1 x W1 (only H1/W1 are used for addressing).
 * Filter : one H2 x W2 plane per channel.
 * Output : D3 planes of H3 x W3.
 *
 * Launch layout: x/y threads cover output columns/rows, blockIdx.z selects the
 * channel. One thread produces one output element.
 *
 * NOTE(review): input reads are not checked against H1/W1; the caller is
 * presumably responsible for padding so every window fits - confirm.
 */
__global__ void DWConv2d_kernel(float *Input, int H1, int W1, int D1,
                                float *Filter, int H2, int W2, int D2,
                                float *Output, int H3, int W3, int D3,
                                int stride)
{
    const int out_x   = blockIdx.x * blockDim.x + threadIdx.x;
    const int out_y   = blockIdx.y * blockDim.y + threadIdx.y;
    const int channel = blockIdx.z;

    // Nothing to do for threads that fall outside the output volume.
    if (out_y >= H3 || out_x >= W3 || channel >= D3)
    {
        return;
    }

    // Start of this channel's filter plane and of this thread's input window.
    const int filter_base = channel * H2 * W2;
    const int window_base = out_y * stride * W1 + out_x * stride + channel * H1 * W1;

    float acc = 0;

    // Walk the filter row by row, accumulating in the same order as a plain
    // nested loop over (j, i).
    for (int fy = 0; fy < H2; ++fy)
    {
        for (int fx = 0; fx < W2; ++fx)
        {
            acc += Filter[fy * W2 + fx + filter_base] *
                   Input[fy * W1 + fx + window_base];
        }
    }

    Output[out_y * W3 + out_x + channel * H3 * W3] = acc;
}

/*
 * Finishes one GEMM output element: adds the per-row bias when requested and
 * then applies the selected activation. Returns the value to be stored in P.
 */
static __device__ __forceinline__ float MatrixMul_Epilogue(float acc, int row,
                                                           int activation,
                                                           int IS_BIASED,
                                                           const float *bias_mat)
{
  // bias_mat is only dereferenced when the caller asked for a bias.
  if (IS_BIASED == BIASED)
  {
    acc += bias_mat[row];
  }

  switch (activation)
  {
    case SWISH_ACTIVATION:
      // Swish activation function: x * sigmoid(x)
      return acc / (1.0f + expf(-1.0f * acc));

    case SIGMOID_ACTIVATION:
      // Sigmoid activation function
      return 1.0f / (1.0f + expf(-1.0f * acc));

    default:
      // No activation: keep the (possibly biased) accumulator.
      return acc;
  }
}

/*
 * MatrixMulKernel
 *
 * Tiled matrix multiply P = M x N with an optional per-row bias and an
 * optional activation applied to every output element.
 *
 *   M : H1 x W1 (row-major)          N : H2 x W2 (row-major)
 *   P : rows addressed with pitch W3; written for Row < H1, Col < W2
 *   num_blocks : number of Tile_GEMM-wide phases along the shared dimension
 *   activation : SWISH_ACTIVATION, SIGMOID_ACTIVATION, anything else = none
 *   IS_BIASED  : BIASED adds bias_mat[Row] before the activation
 *
 * Each thread produces two output elements of the same row, Tile_GEMM columns
 * apart, so one block covers a Tile_GEMM x (2 * Tile_GEMM) patch of P.
 *
 * NOTE(review): the kernel indexes threads as a Tile_GEMM x Tile_GEMM block and
 * hard-codes two column tiles per block; this presumably requires
 * THREAD_GRANULARITY_BLOCKS == 2 and blockDim == (Tile_GEMM, Tile_GEMM) -
 * confirm against the launch code.
 *
 * Fix relative to the previous version: the bias is now part of the stored
 * result even when no activation is selected (it used to be computed and then
 * discarded in that case). Out-of-range tile entries are zero-filled so shared
 * memory is never read uninitialized.
 */
__global__ void MatrixMulKernel(float *M, int H1, int W1, int D1,
                                float *N, int H2, int W2, int D2,
                                float *P, int H3, int W3, int D3,
                                int num_blocks, int activation, 
                                int IS_BIASED, float *bias_mat)
{
  __shared__ float Mds[Tile_GEMM][Tile_GEMM];
  __shared__ float Nds[Tile_GEMM][THREAD_GRANULARITY_BLOCKS * Tile_GEMM];

  const int tx = threadIdx.x;
  const int ty = threadIdx.y;

  // Output row and the two output columns this thread is responsible for.
  const int Row   = blockIdx.y * Tile_GEMM + ty;
  const int Col   = blockIdx.x * THREAD_GRANULARITY_BLOCKS * Tile_GEMM + tx;
  const int Col_2 = Col + Tile_GEMM;

  const bool row_in_range   = Row < H1;
  const bool col_in_range   = Col < W2;
  const bool col_2_in_range = Col_2 < W2;

  float Pvalue = 0.0f;
  float Pvalue_2 = 0.0f;

  // Loop over the M and N tiles required to compute the P elements.
  for (int ph = 0; ph < num_blocks; ++ph)
  {
    const int m_col = ph * Tile_GEMM + tx;  // column of M loaded by this thread
    const int n_row = ph * Tile_GEMM + ty;  // row of N loaded by this thread

    // Collaborative loading of the tiles; entries outside the matrices are
    // set to zero so they contribute nothing and are never left undefined.
    Mds[ty][tx] = (row_in_range && m_col < W1) ? M[Row * W1 + m_col] : 0.0f;
    Nds[ty][tx] = (n_row < H2 && col_in_range) ? N[n_row * W2 + Col] : 0.0f;
    Nds[ty][tx + Tile_GEMM] =
        (n_row < H2 && col_2_in_range) ? N[n_row * W2 + Col_2] : 0.0f;

    // Tiles must be complete before any thread consumes them.
    __syncthreads();

    for (int k = 0; k < Tile_GEMM && (ph * Tile_GEMM) + k < W1; ++k)
    {
      Pvalue += Mds[ty][k] * Nds[k][tx];
      if (col_2_in_range)
        Pvalue_2 += Mds[ty][k] * Nds[k][tx + Tile_GEMM];
    }

    // Do not overwrite the tiles while other threads are still reading them.
    __syncthreads();
  }

  if (row_in_range && col_in_range)
  {
    P[Row * W3 + Col] =
        MatrixMul_Epilogue(Pvalue, Row, activation, IS_BIASED, bias_mat);
  }

  if (row_in_range && col_2_in_range)
  {
    P[Row * W3 + Col_2] =
        MatrixMul_Epilogue(Pvalue_2, Row, activation, IS_BIASED, bias_mat);
  }
}


/*
 * ConvChannelElementWiseMultiplication
 *
 * In-place per-channel scaling: every element of plane `depth` of A
 * (D1 planes of H1 x W1) is multiplied by B[depth].
 *
 * Launch layout: x/y threads cover columns/rows, blockIdx.z selects the plane.
 */
__global__ void ConvChannelElementWiseMultiplication(float *A, int H1, int W1, int D1,
                                                     float *B)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int channel = blockIdx.z;

    // Skip threads that fall outside the H1 x W1 x D1 volume.
    if (y >= H1 || x >= W1 || channel >= D1)
    {
        return;
    }

    const int offset = channel * W1 * H1 + y * W1 + x;
    A[offset] = A[offset] * B[channel];
}

/*
 * CastingDivision
 *
 * Divides the first W1 elements of A by the scalar B, in place.
 *
 * Warning: 1-D kernel - only the x dimension of the launch is used.
 */
__global__ void CastingDivision(float *A, int W1, float B)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the array have nothing to do.
    if (i >= W1)
    {
        return;
    }

    A[i] /= B;
}

/*
 * Identity_Skip
 *
 * Used with MBConv layers that have skip identity = true.
 * Adds B to A element by element, in place: A[d][r][c] += B[d][r][c] for the
 * D1 x H1 x W1 volume. Both arrays are addressed with the same flat index.
 *
 * Launch layout: x/y threads cover columns/rows, blockIdx.z selects the plane.
 */
__global__ void Identity_Skip(float *A,  int H1, int W1, int D1,
                              float *B)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int channel = blockIdx.z;

    // Skip threads that fall outside the volume.
    if (y >= H1 || x >= W1 || channel >= D1)
    {
        return;
    }

    const int offset = channel * W1 * H1 + y * W1 + x;
    A[offset] = A[offset] + B[offset];
}

/*
 * Complete_Padding_Process
 *
 * Copies the un-padded volume `Original` (D2 planes of H2 x W2) into
 * `Original_Padded` (planes of H1 x W1), shifting every element by
 * `padding_value` rows and `padding_value` columns. Only the interior of the
 * padded volume is written here; the border cells are left untouched.
 *
 * NOTE(review): the border is presumably zero-initialised by the caller before
 * this kernel runs - confirm at the call site.
 *
 * Launch layout: x/y/z threads cover columns/rows/planes of `Original`.
 */
__global__ void Complete_Padding_Process(float *Original_Padded, int H1, int W1, int D1, 
                                         float *Original,        int H2, int W2, int D2,
                                         int padding_value)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int z = blockIdx.z * blockDim.z + threadIdx.z;

    // Only threads that map onto an element of the source volume copy data.
    if (y >= H2 || x >= W2 || z >= D2)
    {
        return;
    }

    // The two flat indices differ by a constant (padding_value) shift in both
    // the row and the column direction.
    const int src = z * W2 * H2 + y * W2 + x;
    const int dst = z * W1 * H1 + (y + padding_value) * W1 + (x + padding_value);

    Original_Padded[dst] = Original[src];
}

/* Batch Normalization Kernels */
// Maximum number of threads per block supported by BN_Kernel_Mean_Reduction;
// the kernel's shared buffer holds 2 * BLOCK_SIZE floats.
const int BLOCK_SIZE = 128;

/*
 * BN_Kernel_Mean_Reduction
 *
 * Block-level sum reduction along the rows of `input` (H1 rows of W1 floats).
 * Block (blockIdx.x, blockIdx.y) sums up to 2 * blockDim.x consecutive
 * elements of row blockIdx.y, starting at column blockIdx.x * 2 * blockDim.x,
 * and stores that partial SUM (no division is performed here) in
 * Mean[blockIdx.x + blockIdx.y * W2].
 *
 * Each thread loads two elements: one at offset tx and one shifted by
 * blockDim.x. Positions past the end of the row contribute 0.
 *
 * Preconditions: 1-D thread blocks with blockDim.x <= BLOCK_SIZE (the shared
 * buffer is 2 * BLOCK_SIZE floats). blockDim.x does not need to be a power of
 * two: the reduction folds the upper part of the active range onto the lower
 * part and rounds the split point up, so no element is dropped. For
 * power-of-two block sizes the summation order is the classic halving order.
 */
__global__ void BN_Kernel_Mean_Reduction(float *input, int H1, int W1, int D1,
                                         float *Mean, int W2)
{
    __shared__ float partialSum[2 * BLOCK_SIZE];

    const int tx = threadIdx.x;
    const int bx = blockDim.x;

    // Offsets of this block's segment inside the input matrix.
    const int start = blockIdx.x * (2 * blockDim.x);
    const int start_yDir = blockIdx.y * W1;
    const bool row_valid = start_yDir < H1 * W1;

    // Load 2 elements per thread into shared memory, zero-padding the tail.
    partialSum[tx] = (row_valid && start + tx < W1)
                         ? input[start + tx + start_yDir]
                         : 0.0f;
    partialSum[bx + tx] = (row_valid && start + bx + tx < W1)
                              ? input[start + bx + tx + start_yDir]
                              : 0.0f;

    __syncthreads();

    // Fold the upper part of the active range onto the lower part until a
    // single value remains. `half` is rounded up so an odd-sized range keeps
    // its middle element for the next round. Writes go to [0, active - half),
    // reads come from [half, active), so the two never overlap.
    for (unsigned int active = 2u * blockDim.x; active > 1u; )
    {
        const unsigned int half = (active + 1u) / 2u;

        if ((unsigned int)tx < active - half)
            partialSum[tx] += partialSum[tx + half];

        // Every thread reaches this barrier: the loop bounds are block-uniform.
        __syncthreads();

        active = half;
    }

    if (tx == 0)
        Mean[blockIdx.x + blockIdx.y * W2] = partialSum[0];
}

/*
 * ElementWiseSquaring
 *
 * Replaces every element of A (D1 planes of H1 x W1) with its square, in place.
 *
 * Launch layout: x/y threads cover columns/rows, blockIdx.z selects the plane.
 */
__global__ void ElementWiseSquaring(float *A, int H1, int W1, int D1)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int channel = blockIdx.z;

    // Skip threads that fall outside the volume.
    if (y >= H1 || x >= W1 || channel >= D1)
    {
        return;
    }

    const int offset = channel * W1 * H1 + y * W1 + x;
    A[offset] = A[offset] * A[offset];
}

/*
 * ElementWiseSubtraction
 *
 * Subtracts the per-plane value mean[depth] from every element of plane
 * `depth` of A (D1 planes of H1 x W1), in place.
 *
 * Launch layout: x/y threads cover columns/rows, blockIdx.z selects the plane.
 */
__global__ void ElementWiseSubtraction(float *A, int H1, int W1, int D1,
                                       float *mean)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int channel = blockIdx.z;

    // Skip threads that fall outside the volume.
    if (y >= H1 || x >= W1 || channel >= D1)
    {
        return;
    }

    const int offset = channel * W1 * H1 + y * W1 + x;
    A[offset] = A[offset] - mean[channel];
}


/*
 * BN_Kernel_Final_Layer
 *
 * Applies, in place, the per-channel normalisation
 *     A = (A - D_mean[c]) / sqrt(D_variance[c] + 0.001) * D_weight[c] + D_bias[c]
 * to every element of A (D1 planes of H1 x W1), followed by an optional
 * activation.
 *
 * activate: 0 -> no activation, 1 -> swish, 2 -> sigmoid
 *           (any other value behaves like 0).
 *
 * Launch layout: x/y threads cover columns/rows, blockIdx.z selects the channel.
 */
__global__ void BN_Kernel_Final_Layer(float *A, int H1, int W1, int D1, 
                                      float *D_mean, float *D_variance,
                                      float *D_weight, float *D_bias,
                                      int activate)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int channel = blockIdx.z;

    // Skip threads that fall outside the volume.
    if (y >= H1 || x >= W1 || channel >= D1)
    {
        return;
    }

    const int offset = channel * W1 * H1 + y * W1 + x;

    // Normalise with the channel statistics, then scale and shift.
    float value = ((A[offset] - D_mean[channel]) / (sqrtf(D_variance[channel] + 0.001f))) * D_weight[channel] + D_bias[channel];

    if (activate == 1)
    {
        // Swish activation function
        value = value / (1.0f + expf(-1.0f * value));
    }
    else if (activate == 2)
    {
        // Sigmoid activation function
        value = 1.0f / (1.0f + expf(-1.0f * value));
    }

    A[offset] = value;
}

'File written in /content/src/KERNELS.cu'

In [154]:
%%cuda --name APP.cu 

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cusolverDn.h>
#include <cuda_runtime.h>


#include "/content/MBCONVS_float/Input_For_Stem_Layer.h"
#include "/content/MBCONVS_float/Stem/Stem_conv_parameters.h"
#include "/content/MBCONVS_float/functionsV2.h"
#include "/content/MBCONVS_float/CONFIG.h"
#include "/content/MBCONVS_float/Input_Matrix.h"
#include "/content/MBCONVS_float/KERNELSH.h"

#include "/content/MBCONVS_float/MBConv1_0/MBConv1_0_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MBConv1_0/MBConv1_0_project_conv_parameters.h"
#include "/content/MBCONVS_float/MBConv1_0/MBConv1_0_squeeze_excitation_parameters.h"

#include "/content/MBCONVS_float/MbConv6_1/MBConv6_1_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_1/MBConv6_1_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_1/MBConv6_1_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_1/MBConv6_1_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_2/MBConv6_2_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_2/MBConv6_2_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_2/MBConv6_2_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_2/MBConv6_2_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_3/MBConv6_3_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_3/MBConv6_3_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_3/MBConv6_3_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_3/MBConv6_3_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_4/MBConv6_4_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_4/MBConv6_4_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_4/MBConv6_4_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_4/MBConv6_4_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_5/MBConv6_5_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_5/MBConv6_5_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_5/MBConv6_5_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_5/MBConv6_5_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_6/MBConv6_6_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_6/MBConv6_6_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_6/MBConv6_6_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_6/MBConv6_6_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_7/MBConv6_7_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_7/MBConv6_7_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_7/MBConv6_7_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_7/MBConv6_7_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_8/MBConv6_8_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_8/MBConv6_8_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_8/MBConv6_8_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_8/MBConv6_8_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_9/MBConv6_9_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_9/MBConv6_9_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_9/MBConv6_9_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_9/MBConv6_9_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_10/MBConv6_10_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_10/MBConv6_10_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_10/MBConv6_10_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_10/MBConv6_10_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_11/MBConv6_11_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_11/MBConv6_11_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_11/MBConv6_11_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_11/MBConv6_11_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_12/MBConv6_12_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_12/MBConv6_12_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_12/MBConv6_12_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_12/MBConv6_12_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_13/MBConv6_13_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_13/MBConv6_13_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_13/MBConv6_13_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_13/MBConv6_13_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_14/MBConv6_14_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_14/MBConv6_14_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_14/MBConv6_14_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_14/MBConv6_14_project_conv_parameters.h"

#include "/content/MBCONVS_float/MbConv6_15/MBConv6_15_expansion_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_15/MBConv6_15_depthwise_conv_parameters.h"
#include "/content/MBCONVS_float/MbConv6_15/MBConv6_15_squeeze_excitation_parameters.h"
#include "/content/MBCONVS_float/MbConv6_15/MBConv6_15_project_conv_parameters.h"

#include "/content/MBCONVS_float/Head/Head_conv_parameters.h"


int MBCONV1_0_flag = 0;

int main()
{
  // 1. Define dimensions for input image.
  set_allocate_copy_array_Device(&DInput_Mat, Input_for_stem_conv,
                                 INPUT_IMAGE_HEIGHT, INPUT_IMAGE_WIDTH, 
                                 INPUT_IMAGE_DEPTH,
                                 "Input Image is allocated in device memory");  

  // 2. Get layers' filters ready
  set_allocate_copy_array_Device(&F_STEM, Stem_conv2d_weights,
                                 STEM_FILTER_HEIGHT, STEM_FILTER_WIDTH, 
                                 STEM_FILTER_DEPTH * STEM_FILTER_DENSITY,
                                 "Stem Filter  is allocated in device memory");
  
  DEFINE_FILTERS_FOR_MBCONV(&D_MBConv_1_0_EXPD_WEIGHTS, NULL, 
                            MBCONV_1_0_EXPD_F_HEIGHT,   MBCONV_1_0_EXPD_F_WIDTH, 
                            MBCONV_1_0_EXPD_F_DEPTH * MBCONV_1_0_EXPD_F_DENSITY,
                            &D_MBConv_1_0_DW_WEIGHTS, MBConv1_0_depthwise_conv_conv2d_weights, 
                            MBCONV_1_0_DW_F_HEIGHT, MBCONV_1_0_DW_F_WIDTH, 
                            MBCONV_1_0_DW_F_DEPTH * MBCONV_1_0_DW_F_DENSITY,
                            &D_MBConv_1_0_SQZ_1_WEIGHTS, MBConv1_0_squeeze_excitation1_conv2d_weights,
                            MBCONV_1_0_SQZ_1_F_HEIGHT, MBCONV_1_0_SQZ_1_F_WIDTH, 
                            MBCONV_1_0_SQZ_1_F_DEPTH * MBCONV_1_0_SQZ_1_F_DENSITY,
                            &D_MBConv_1_0_SQZ_2_WEIGHTS, MBConv1_0_squeeze_excitation2_conv2d_weights, 
                            MBCONV_1_0_SQZ_2_F_HEIGHT, MBCONV_1_0_SQZ_2_F_WIDTH, 
                            MBCONV_1_0_SQZ_2_F_DEPTH * MBCONV_1_0_SQZ_2_F_DENSITY,
                            &D_MBConv_1_0_PRJ_WEIGHTS, MBConv1_0_project_conv_conv2d_weights, 
                            MBCONV_1_0_PRJ_F_HEIGHT, MBCONV_1_0_PRJ_F_WIDTH, 
                            MBCONV_1_0_PRJ_F_DEPTH * MBCONV_1_0_PRJ_F_DENSITY); 

  DEFINE_FILTERS_FOR_MBCONV(&D_MBConv_6_1_EXPD_WEIGHTS, MBConv6_1_expansion_conv_conv2d_weights, 
                            MBCONV_6_1_EXPD_F_HEIGHT,   MBCONV_6_1_EXPD_F_WIDTH, 
                            MBCONV_6_1_EXPD_F_DEPTH * MBCONV_6_1_EXPD_F_DENSITY,
                            &D_MBConv_6_1_DW_WEIGHTS, MBConv6_1_depthwise_conv_conv2d_weights, 
                            MBCONV_6_1_DW_F_HEIGHT, MBCONV_6_1_DW_F_WIDTH, 
                            MBCONV_6_1_DW_F_DEPTH * MBCONV_6_1_DW_F_DENSITY,
                            &D_MBConv_6_1_SQZ_1_WEIGHTS, MBConv6_1_squeeze_excitation1_conv2d_weights,
                            MBCONV_6_1_SQZ_1_F_HEIGHT, MBCONV_6_1_SQZ_1_F_WIDTH, 
                            MBCONV_6_1_SQZ_1_F_DEPTH * MBCONV_6_1_SQZ_1_F_DENSITY,
                            &D_MBConv_6_1_SQZ_2_WEIGHTS, MBConv6_1_squeeze_excitation2_conv2d_weights, 
                            MBCONV_6_1_SQZ_2_F_HEIGHT, MBCONV_6_1_SQZ_2_F_WIDTH, 
                            MBCONV_6_1_SQZ_2_F_DEPTH * MBCONV_6_1_SQZ_2_F_DENSITY,
                            &D_MBConv_6_1_PRJ_WEIGHTS, MBConv6_1_project_conv_conv2d_weights, 
                            MBCONV_6_1_PRJ_F_HEIGHT, MBCONV_6_1_PRJ_F_WIDTH, 
                            MBCONV_6_1_PRJ_F_DEPTH * MBCONV_6_1_PRJ_F_DENSITY); 

  DEFINE_FILTERS_FOR_MBCONV(&D_MBConv_6_2_EXPD_WEIGHTS, MBConv6_2_expansion_conv_conv2d_weights, 
                            MBCONV_6_2_EXPD_F_HEIGHT,   MBCONV_6_2_EXPD_F_WIDTH, 
                            MBCONV_6_2_EXPD_F_DEPTH * MBCONV_6_2_EXPD_F_DENSITY,
                            &D_MBConv_6_2_DW_WEIGHTS, MBConv6_2_depthwise_conv_conv2d_weights, 
                            MBCONV_6_2_DW_F_HEIGHT, MBCONV_6_2_DW_F_WIDTH, 
                            MBCONV_6_2_DW_F_DEPTH * MBCONV_6_2_DW_F_DENSITY,
                            &D_MBConv_6_2_SQZ_1_WEIGHTS, MBConv6_2_squeeze_excitation1_conv2d_weights,
                            MBCONV_6_2_SQZ_1_F_HEIGHT, MBCONV_6_2_SQZ_1_F_WIDTH, 
                            MBCONV_6_2_SQZ_1_F_DEPTH * MBCONV_6_2_SQZ_1_F_DENSITY,
                            &D_MBConv_6_2_SQZ_2_WEIGHTS, MBConv6_2_squeeze_excitation2_conv2d_weights, 
                            MBCONV_6_2_SQZ_2_F_HEIGHT, MBCONV_6_2_SQZ_2_F_WIDTH, 
                            MBCONV_6_2_SQZ_2_F_DEPTH * MBCONV_6_2_SQZ_2_F_DENSITY,
                            &D_MBConv_6_2_PRJ_WEIGHTS, MBConv6_2_project_conv_conv2d_weights, 
                            MBCONV_6_2_PRJ_F_HEIGHT, MBCONV_6_2_PRJ_F_WIDTH, 
                            MBCONV_6_2_PRJ_F_DEPTH * MBCONV_6_2_PRJ_F_DENSITY);

  DEFINE_FILTERS_FOR_MBCONV(&D_MBConv_6_3_EXPD_WEIGHTS, MBConv6_3_expansion_conv_conv2d_weights, 
                            MBCONV_6_3_EXPD_F_HEIGHT,   MBCONV_6_3_EXPD_F_WIDTH, 
                            MBCONV_6_3_EXPD_F_DEPTH * MBCONV_6_3_EXPD_F_DENSITY,
                            &D_MBConv_6_3_DW_WEIGHTS, MBConv6_3_depthwise_conv_conv2d_weights, 
                            MBCONV_6_3_DW_F_HEIGHT, MBCONV_6_3_DW_F_WIDTH, 
                            MBCONV_6_3_DW_F_DEPTH * MBCONV_6_3_DW_F_DENSITY,
                            &D_MBConv_6_3_SQZ_1_WEIGHTS, MBConv6_3_squeeze_excitation1_conv2d_weights,
                            MBCONV_6_3_SQZ_1_F_HEIGHT, MBCONV_6_3_SQZ_1_F_WIDTH, 
                            MBCONV_6_3_SQZ_1_F_DEPTH * MBCONV_6_3_SQZ_1_F_DENSITY,
                            &D_MBConv_6_3_SQZ_2_WEIGHTS, MBConv6_3_squeeze_excitation2_conv2d_weights, 
                            MBCONV_6_3_SQZ_2_F_HEIGHT, MBCONV_6_3_SQZ_2_F_WIDTH, 
                            MBCONV_6_3_SQZ_2_F_DEPTH * MBCONV_6_3_SQZ_2_F_DENSITY,
                            &D_MBConv_6_3_PRJ_WEIGHTS, MBConv6_3_project_conv_conv2d_weights, 
                            MBCONV_6_3_PRJ_F_HEIGHT, MBCONV_6_3_PRJ_F_WIDTH, 
                            MBCONV_6_3_PRJ_F_DEPTH * MBCONV_6_3_PRJ_F_DENSITY);


  DEFINE_FILTERS_FOR_MBCONV(&D_MBConv_6_4_EXPD_WEIGHTS, MBConv6_4_expansion_conv_conv2d_weights, 
                            MBCONV_6_4_EXPD_F_HEIGHT,   MBCONV_6_4_EXPD_F_WIDTH, 
                            MBCONV_6_4_EXPD_F_DEPTH * MBCONV_6_4_EXPD_F_DENSITY,
                            &D_MBConv_6_4_DW_WEIGHTS, MBConv6_4_depthwise_conv_conv2d_weights, 
                            MBCONV_6_4_DW_F_HEIGHT, MBCONV_6_4_DW_F_WIDTH, 
                            MBCONV_6_4_DW_F_DEPTH * MBCONV_6_4_DW_F_DENSITY,
                            &D_MBConv_6_4_SQZ_1_WEIGHTS, MBConv6_4_squeeze_excitation1_conv2d_weights,
                            MBCONV_6_4_SQZ_1_F_HEIGHT, MBCONV_6_4_SQZ_1_F_WIDTH, 
                            MBCONV_6_4_SQZ_1_F_DEPTH * MBCONV_6_4_SQZ_1_F_DENSITY,
                            &D_MBConv_6_4_SQZ_2_WEIGHTS, MBConv6_4_squeeze_excitation2_conv2d_weights, 
                            MBCONV_6_4_SQZ_2_F_HEIGHT, MBCONV_6_4_SQZ_2_F_WIDTH, 
                            MBCONV_6_4_SQZ_2_F_DEPTH * MBCONV_6_4_SQZ_2_F_DENSITY,
                            &D_MBConv_6_4_PRJ_WEIGHTS, MBConv6_4_project_conv_conv2d_weights, 
                            MBCONV_6_4_PRJ_F_HEIGHT, MBCONV_6_4_PRJ_F_WIDTH, 
                            MBCONV_6_4_PRJ_F_DEPTH * MBCONV_6_4_PRJ_F_DENSITY);

  DEFINE_FILTERS_FOR_MBCONV(&D_MBConv_6_5_EXPD_WEIGHTS, MBConv6_5_expansion_conv_conv2d_weights, 
                            MBCONV_6_5_EXPD_F_HEIGHT,   MBCONV_6_5_EXPD_F_WIDTH, 
                            MBCONV_6_5_EXPD_F_DEPTH * MBCONV_6_5_EXPD_F_DENSITY,
                            &D_MBConv_6_5_DW_WEIGHTS, MBConv6_5_depthwise_conv_conv2d_weights, 
                            MBCONV_6_5_DW_F_HEIGHT, MBCONV_6_5_DW_F_WIDTH, 
                            MBCONV_6_5_DW_F_DEPTH * MBCONV_6_5_DW_F_DENSITY,
                            &D_MBConv_6_5_SQZ_1_WEIGHTS, MBConv6_5_squeeze_excitation1_conv2d_weights,
                            MBCONV_6_5_SQZ_1_F_HEIGHT, MBCONV_6_5_SQZ_1_F_WIDTH, 
                            MBCONV_6_5_SQZ_1_F_DEPTH * MBCONV_6_5_SQZ_1_F_DENSITY,
                            &D_MBConv_6_5_SQZ_2_WEIGHTS, MBConv6_5_squeeze_excitation2_conv2d_weights, 
                            MBCONV_6_5_SQZ_2_F_HEIGHT, MBCONV_6_5_SQZ_2_F_WIDTH, 
                            MBCONV_6_5_SQZ_2_F_DEPTH * MBCONV_6_5_SQZ_2_F_DENSITY,
                            &D_MBConv_6_5_PRJ_WEIGHTS, MBConv6_5_project_conv_conv2d_weights, 
                            MBCONV_6_5_PRJ_F_HEIGHT, MBCONV_6_5_PRJ_F_WIDTH, 
                            MBCONV_6_5_PRJ_F_DEPTH * MBCONV_6_5_PRJ_F_DENSITY);
     
  DEFINE_FILTERS_FOR_MBCONV(&D_MBConv_6_6_EXPD_WEIGHTS, MBConv6_6_expansion_conv_conv2d_weights, 
                            MBCONV_6_6_EXPD_F_HEIGHT,   MBCONV_6_6_EXPD_F_WIDTH, 
                            MBCONV_6_6_EXPD_F_DEPTH * MBCONV_6_6_EXPD_F_DENSITY,
                            &D_MBConv_6_6_DW_WEIGHTS, MBConv6_6_depthwise_conv_conv2d_weights, 
                            MBCONV_6_6_DW_F_HEIGHT, MBCONV_6_6_DW_F_WIDTH, 
                            MBCONV_6_6_DW_F_DEPTH * MBCONV_6_6_DW_F_DENSITY,
                            &D_MBConv_6_6_SQZ_1_WEIGHTS, MBConv6_6_squeeze_excitation1_conv2d_weights,
                            MBCONV_6_6_SQZ_1_F_HEIGHT, MBCONV_6_6_SQZ_1_F_WIDTH, 
                            MBCONV_6_6_SQZ_1_F_DEPTH * MBCONV_6_6_SQZ_1_F_DENSITY,
                            &D_MBConv_6_6_SQZ_2_WEIGHTS, MBConv6_6_squeeze_excitation2_conv2d_weights, 
                            MBCONV_6_6_SQZ_2_F_HEIGHT, MBCONV_6_6_SQZ_2_F_WIDTH, 
                            MBCONV_6_6_SQZ_2_F_DEPTH * MBCONV_6_6_SQZ_2_F_DENSITY,
                            &D_MBConv_6_6_PRJ_WEIGHTS, MBConv6_6_project_conv_conv2d_weights, 
                            MBCONV_6_6_PRJ_F_HEIGHT, MBCONV_6_6_PRJ_F_WIDTH, 
                            MBCONV_6_6_PRJ_F_DEPTH * MBCONV_6_6_PRJ_F_DENSITY);
     
  DEFINE_FILTERS_FOR_MBCONV(&D_MBConv_6_7_EXPD_WEIGHTS, MBConv6_7_expansion_conv_conv2d_weights, 
                            MBCONV_6_7_EXPD_F_HEIGHT,   MBCONV_6_7_EXPD_F_WIDTH, 
                            MBCONV_6_7_EXPD_F_DEPTH * MBCONV_6_7_EXPD_F_DENSITY,
                            &D_MBConv_6_7_DW_WEIGHTS, MBConv6_7_depthwise_conv_conv2d_weights, 
                            MBCONV_6_7_DW_F_HEIGHT, MBCONV_6_7_DW_F_WIDTH, 
                            MBCONV_6_7_DW_F_DEPTH * MBCONV_6_7_DW_F_DENSITY,
                            &D_MBConv_6_7_SQZ_1_WEIGHTS, MBConv6_7_squeeze_excitation1_conv2d_weights,
                            MBCONV_6_7_SQZ_1_F_HEIGHT, MBCONV_6_7_SQZ_1_F_WIDTH, 
                            MBCONV_6_7_SQZ_1_F_DEPTH * MBCONV_6_7_SQZ_1_F_DENSITY,
                            &D_MBConv_6_7_SQZ_2_WEIGHTS, MBConv6_7_squeeze_excitation2_conv2d_weights, 
                            MBCONV_6_7_SQZ_2_F_HEIGHT, MBCONV_6_7_SQZ_2_F_WIDTH, 
                            MBCONV_6_7_SQZ_2_F_DEPTH * MBCONV_6_7_SQZ_2_F_DENSITY,
                            &D_MBConv_6_7_PRJ_WEIGHTS, MBConv6_7_project_conv_conv2d_weights, 
                            MBCONV_6_7_PRJ_F_HEIGHT, MBCONV_6_7_PRJ_F_WIDTH, 
                            MBCONV_6_7_PRJ_F_DEPTH * MBCONV_6_7_PRJ_F_DENSITY);


  DEFINE_FILTERS_FOR_MBCONV(&D_MBConv_6_8_EXPD_WEIGHTS, MBConv6_8_expansion_conv_conv2d_weights, 
                            MBCONV_6_8_EXPD_F_HEIGHT,   MBCONV_6_8_EXPD_F_WIDTH, 
                            MBCONV_6_8_EXPD_F_DEPTH * MBCONV_6_8_EXPD_F_DENSITY,
                            &D_MBConv_6_8_DW_WEIGHTS, MBConv6_8_depthwise_conv_conv2d_weights, 
                            MBCONV_6_8_DW_F_HEIGHT, MBCONV_6_8_DW_F_WIDTH, 
                            MBCONV_6_8_DW_F_DEPTH * MBCONV_6_8_DW_F_DENSITY,
                            &D_MBConv_6_8_SQZ_1_WEIGHTS, MBConv6_8_squeeze_excitation1_conv2d_weights,
                            MBCONV_6_8_SQZ_1_F_HEIGHT, MBCONV_6_8_SQZ_1_F_WIDTH, 
                            MBCONV_6_8_SQZ_1_F_DEPTH * MBCONV_6_8_SQZ_1_F_DENSITY,
                            &D_MBConv_6_8_SQZ_2_WEIGHTS, MBConv6_8_squeeze_excitation2_conv2d_weights, 
                            MBCONV_6_8_SQZ_2_F_HEIGHT, MBCONV_6_8_SQZ_2_F_WIDTH, 
                            MBCONV_6_8_SQZ_2_F_DEPTH * MBCONV_6_8_SQZ_2_F_DENSITY,
                            &D_MBConv_6_8_PRJ_WEIGHTS, MBConv6_8_project_conv_conv2d_weights, 
                            MBCONV_6_8_PRJ_F_HEIGHT, MBCONV_6_8_PRJ_F_WIDTH, 
                            MBCONV_6_8_PRJ_F_DEPTH * MBCONV_6_8_PRJ_F_DENSITY);

  /*
   * Hand the convolution filters of MBConv6 blocks 9 through 15 to
   * DEFINE_FILTERS_FOR_MBCONV, one call per block, in ascending block order.
   *
   * Each call passes five groups of
   *   (&destination, host weights, F_HEIGHT, F_WIDTH, F_DEPTH * F_DENSITY)
   * in the order: expansion (EXPD), depthwise (DW), squeeze-excitation 1
   * (SQZ_1), squeeze-excitation 2 (SQZ_2), projection (PRJ).
   *
   * The identifiers differ only in the block index, so the local macro below
   * pastes the index into every name.  All 25 arguments are written out inside
   * this single macro (no nested argument-producing macros), so the expansion
   * is token-for-token the hand-written call.
   *
   * NOTE(review): DEFINE_FILTERS_FOR_MBCONV is defined outside this section;
   * presumably it allocates device buffers and copies the weights -- confirm.
   */
  #define UPLOAD_MBCONV6_FILTERS(N)                                                          \
    DEFINE_FILTERS_FOR_MBCONV(                                                               \
      /* expansion */                                                                        \
      &D_MBConv_6_##N##_EXPD_WEIGHTS, MBConv6_##N##_expansion_conv_conv2d_weights,           \
      MBCONV_6_##N##_EXPD_F_HEIGHT, MBCONV_6_##N##_EXPD_F_WIDTH,                             \
      MBCONV_6_##N##_EXPD_F_DEPTH * MBCONV_6_##N##_EXPD_F_DENSITY,                           \
      /* depthwise */                                                                        \
      &D_MBConv_6_##N##_DW_WEIGHTS, MBConv6_##N##_depthwise_conv_conv2d_weights,             \
      MBCONV_6_##N##_DW_F_HEIGHT, MBCONV_6_##N##_DW_F_WIDTH,                                 \
      MBCONV_6_##N##_DW_F_DEPTH * MBCONV_6_##N##_DW_F_DENSITY,                               \
      /* squeeze-excitation 1 */                                                             \
      &D_MBConv_6_##N##_SQZ_1_WEIGHTS, MBConv6_##N##_squeeze_excitation1_conv2d_weights,     \
      MBCONV_6_##N##_SQZ_1_F_HEIGHT, MBCONV_6_##N##_SQZ_1_F_WIDTH,                           \
      MBCONV_6_##N##_SQZ_1_F_DEPTH * MBCONV_6_##N##_SQZ_1_F_DENSITY,                         \
      /* squeeze-excitation 2 */                                                             \
      &D_MBConv_6_##N##_SQZ_2_WEIGHTS, MBConv6_##N##_squeeze_excitation2_conv2d_weights,     \
      MBCONV_6_##N##_SQZ_2_F_HEIGHT, MBCONV_6_##N##_SQZ_2_F_WIDTH,                           \
      MBCONV_6_##N##_SQZ_2_F_DEPTH * MBCONV_6_##N##_SQZ_2_F_DENSITY,                         \
      /* projection */                                                                       \
      &D_MBConv_6_##N##_PRJ_WEIGHTS, MBConv6_##N##_project_conv_conv2d_weights,              \
      MBCONV_6_##N##_PRJ_F_HEIGHT, MBCONV_6_##N##_PRJ_F_WIDTH,                               \
      MBCONV_6_##N##_PRJ_F_DEPTH * MBCONV_6_##N##_PRJ_F_DENSITY)

  UPLOAD_MBCONV6_FILTERS(9);
  UPLOAD_MBCONV6_FILTERS(10);
  UPLOAD_MBCONV6_FILTERS(11);
  UPLOAD_MBCONV6_FILTERS(12);
  UPLOAD_MBCONV6_FILTERS(13);
  UPLOAD_MBCONV6_FILTERS(14);
  UPLOAD_MBCONV6_FILTERS(15);

  /* The helper is only meaningful for this run of calls. */
  #undef UPLOAD_MBCONV6_FILTERS


  // Register the head-stage weights.
  // NOTE(review): set_allocate_copy_array_Device is defined outside this
  // section; judging by its name and the messages passed here it presumably
  // allocates a device buffer and copies the host array into it -- confirm.
  // The three numeric arguments after the host array look like height, width
  // and depth extents -- TODO confirm against the helper's signature.

  // Head convolution filter; the last extent is HEAD_CONV_F_DEPTH * HEAD_CONV_F_DENSITY.
  set_allocate_copy_array_Device(&HEAD_CONV_WEIGHTS, Head_conv2d_weights,
                                  HEAD_CONV_F_HEIGHT, HEAD_CONV_F_WIDTH, HEAD_CONV_F_DEPTH * HEAD_CONV_F_DENSITY,
                                "Head Filter  is allocated in device memory");   
 
  // Head fully connected (linear) weights, passed with a last extent of 1.
  set_allocate_copy_array_Device(&HEAD_FC_WEIGHTS, Head_linear_weights,
                                HEAD_FC_F_HEIGHT, HEAD_FC_F_WIDTH, 1,
                                "Fully Connected weights matrix is allocated in device memory");  
  
  /*
   * Register the bias vectors of both squeeze-excitation convolutions for
   * every MBConv block.
   *
   * Order of the calls: squeeze 1 for MBConv6 blocks 15 down to 1, then
   * MBConv1_0; afterwards squeeze 2 in the same block order.
   *
   * For block prefix BLOCK and squeeze index S the local macro expands to
   *   set_allocate_copy_array_Device(&BLOCK_SQZ_S_bias,
   *       BLOCK_squeeze_excitationS_conv2d_bias,
   *       sizeof(BLOCK_squeeze_excitationS_conv2d_bias)/sizeof(float), 1, 1,
   *       LABEL);
   * i.e. the element count is taken from the host array's size and the two
   * remaining extents are 1.  LABEL is passed through untouched.
   * NOTE(review): the sizeof-based count assumes the host symbols are true
   * arrays (not pointers) at this point -- confirm in the weights header.
   */
  #define UPLOAD_SQZ_BIAS(BLOCK, S, LABEL)                                             \
    set_allocate_copy_array_Device(&BLOCK##_SQZ_##S##_bias,                            \
                                   BLOCK##_squeeze_excitation##S##_conv2d_bias,        \
                                   sizeof(BLOCK##_squeeze_excitation##S##_conv2d_bias)/sizeof(float), 1, 1, \
                                   LABEL)

  /* Squeeze-excitation 1 biases. */
  UPLOAD_SQZ_BIAS(MBConv6_15, 1, "Bias for squeeze 1 layer #15");
  UPLOAD_SQZ_BIAS(MBConv6_14, 1, "Bias for squeeze 1 layer #14");
  UPLOAD_SQZ_BIAS(MBConv6_13, 1, "Bias for squeeze 1 layer #13");
  UPLOAD_SQZ_BIAS(MBConv6_12, 1, "Bias for squeeze 1 layer #12");
  UPLOAD_SQZ_BIAS(MBConv6_11, 1, "Bias for squeeze 1 layer #11");
  UPLOAD_SQZ_BIAS(MBConv6_10, 1, "Bias for squeeze 1 layer #10");
  UPLOAD_SQZ_BIAS(MBConv6_9,  1, "Bias for squeeze 1 layer #9");
  UPLOAD_SQZ_BIAS(MBConv6_8,  1, "Bias for squeeze 1 layer #8");
  UPLOAD_SQZ_BIAS(MBConv6_7,  1, "Bias for squeeze 1 layer #7");
  UPLOAD_SQZ_BIAS(MBConv6_6,  1, "Bias for squeeze 1 layer #6");
  UPLOAD_SQZ_BIAS(MBConv6_5,  1, "Bias for squeeze 1 layer #5");
  UPLOAD_SQZ_BIAS(MBConv6_4,  1, "Bias for squeeze 1 layer #4");
  UPLOAD_SQZ_BIAS(MBConv6_3,  1, "Bias for squeeze 1 layer #3");
  UPLOAD_SQZ_BIAS(MBConv6_2,  1, "Bias for squeeze 1 layer #2");
  UPLOAD_SQZ_BIAS(MBConv6_1,  1, "Bias for squeeze 1 layer #1");
  UPLOAD_SQZ_BIAS(MBConv1_0,  1, "Bias for squeeze 1 layer #0");

  /* Squeeze-excitation 2 biases. */
  UPLOAD_SQZ_BIAS(MBConv6_15, 2, "Bias for squeeze 2 layer #15");
  UPLOAD_SQZ_BIAS(MBConv6_14, 2, "Bias for squeeze 2 layer #14");
  UPLOAD_SQZ_BIAS(MBConv6_13, 2, "Bias for squeeze 2 layer #13");
  UPLOAD_SQZ_BIAS(MBConv6_12, 2, "Bias for squeeze 2 layer #12");
  UPLOAD_SQZ_BIAS(MBConv6_11, 2, "Bias for squeeze 2 layer #11");
  UPLOAD_SQZ_BIAS(MBConv6_10, 2, "Bias for squeeze 2 layer #10");
  UPLOAD_SQZ_BIAS(MBConv6_9,  2, "Bias for squeeze 2 layer #9");
  UPLOAD_SQZ_BIAS(MBConv6_8,  2, "Bias for squeeze 2 layer #8");
  UPLOAD_SQZ_BIAS(MBConv6_7,  2, "Bias for squeeze 2 layer #7");
  UPLOAD_SQZ_BIAS(MBConv6_6,  2, "Bias for squeeze 2 layer #6");
  UPLOAD_SQZ_BIAS(MBConv6_5,  2, "Bias for squeeze 2 layer #5");
  UPLOAD_SQZ_BIAS(MBConv6_4,  2, "Bias for squeeze 2 layer #4");
  UPLOAD_SQZ_BIAS(MBConv6_3,  2, "Bias for squeeze 2 layer #3");
  UPLOAD_SQZ_BIAS(MBConv6_2,  2, "Bias for squeeze 2 layer #2");
  UPLOAD_SQZ_BIAS(MBConv6_1,  2, "Bias for squeeze 2 layer #1");
  UPLOAD_SQZ_BIAS(MBConv1_0,  2, "Bias for squeeze 2 layer #0");

  /* The helper is only meaningful for this run of calls. */
  #undef UPLOAD_SQZ_BIAS

// 3. Define the batch-norm (BN) mean, variance, weights and bias arrays.
//
// MBConv1_0 is registered first and on its own: its four expansion-stage
// entries are passed as NULL with a count of 0, while the depthwise and
// projection stages pass real host arrays with counts derived from the array
// sizes (sizeof(array) / sizeof(float)).
//
// MBCONV1_0_flag is raised to 1 for the duration of this one call and reset
// to 0 right after it, before the MBConv6 blocks are registered.
// NOTE(review): presumably DEFINE_FILTERS_FOR_MBCONV_BN reads this flag to
// skip the NULL expansion entries -- confirm in its definition. Keep the
// set / call / reset order intact.
MBCONV1_0_flag = 1;

DEFINE_FILTERS_FOR_MBCONV_BN(	
  // Expansion stage: no host data supplied for MBConv1_0.
  &MBConv1_0_EXPD_BN_MEAN,      NULL, 0,
  &MBConv1_0_EXPD_BN_VARIANCE,	NULL, 0,
  &MBConv1_0_EXPD_BN_WEIGHTS,		NULL, 0,
  &MBConv1_0_EXPD_BN_BIAS,			NULL, 0,

  // Depthwise stage: (destination, host array, element count) per BN parameter.
  &MBConv1_0_DW_BN_MEAN,        MBConv1_0_depthwise_conv_BN_mean,		  sizeof(MBConv1_0_depthwise_conv_BN_mean) / sizeof(float), 		
  &MBConv1_0_DW_BN_VARIANCE,		MBConv1_0_depthwise_conv_BN_variance,	sizeof(MBConv1_0_depthwise_conv_BN_variance) / sizeof(float),
  &MBConv1_0_DW_BN_WEIGHTS,     MBConv1_0_depthwise_conv_BN_weights,	sizeof(MBConv1_0_depthwise_conv_BN_weights) / sizeof(float),
  &MBConv1_0_DW_BN_BIAS,				MBConv1_0_depthwise_conv_BN_bias,		  sizeof(MBConv1_0_depthwise_conv_BN_bias) / sizeof(float),

  // Projection stage: same triple layout as the depthwise stage.
  &MBConv1_0_PRJ_BN_MEAN,       MBConv1_0_project_conv_BN_mean,			  sizeof(MBConv1_0_project_conv_BN_mean) / sizeof(float),
  &MBConv1_0_PRJ_BN_VARIANCE,		MBConv1_0_project_conv_BN_variance,		sizeof(MBConv1_0_project_conv_BN_variance) / sizeof(float),
  &MBConv1_0_PRJ_BN_WEIGHTS,    MBConv1_0_project_conv_BN_weights,		sizeof(MBConv1_0_project_conv_BN_weights) / sizeof(float),
  &MBConv1_0_PRJ_BN_BIAS,				MBConv1_0_project_conv_BN_bias, 		  sizeof(MBConv1_0_project_conv_BN_bias) / sizeof(float));

// Back to the default flag value for the blocks registered after this point.
MBCONV1_0_flag = 0;

/*
 * Hand the batch-norm parameters of MBConv6 blocks 1 through 9 to
 * DEFINE_FILTERS_FOR_MBCONV_BN, one call per block, in ascending block order.
 *
 * Each call passes twelve (&destination, host array, element count) triples:
 * mean, variance, weights and bias for the expansion (EXPD), depthwise (DW)
 * and projection (PRJ) stages, in that order.  The element count is always
 * sizeof(host array) / sizeof(float).
 *
 * The identifiers differ only in the block index, so the local macro below
 * pastes the index into every name.  All 36 arguments are written out inside
 * this single macro, so the expansion is token-for-token the hand-written call.
 *
 * NOTE(review): DEFINE_FILTERS_FOR_MBCONV_BN is defined outside this section;
 * presumably it allocates device buffers and copies the arrays -- confirm.
 */
#define UPLOAD_MBCONV6_BN(N)                                                                                                                      \
  DEFINE_FILTERS_FOR_MBCONV_BN(                                                                                                                   \
    /* expansion stage */                                                                                                                         \
    &MBConv6_##N##_EXPD_BN_MEAN,     MBConv6_##N##_expansion_conv_BN_mean,     sizeof(MBConv6_##N##_expansion_conv_BN_mean) / sizeof(float),     \
    &MBConv6_##N##_EXPD_BN_VARIANCE, MBConv6_##N##_expansion_conv_BN_variance, sizeof(MBConv6_##N##_expansion_conv_BN_variance) / sizeof(float), \
    &MBConv6_##N##_EXPD_BN_WEIGHTS,  MBConv6_##N##_expansion_conv_BN_weights,  sizeof(MBConv6_##N##_expansion_conv_BN_weights) / sizeof(float),  \
    &MBConv6_##N##_EXPD_BN_BIAS,     MBConv6_##N##_expansion_conv_BN_bias,     sizeof(MBConv6_##N##_expansion_conv_BN_bias) / sizeof(float),     \
    /* depthwise stage */                                                                                                                         \
    &MBConv6_##N##_DW_BN_MEAN,       MBConv6_##N##_depthwise_conv_BN_mean,     sizeof(MBConv6_##N##_depthwise_conv_BN_mean) / sizeof(float),     \
    &MBConv6_##N##_DW_BN_VARIANCE,   MBConv6_##N##_depthwise_conv_BN_variance, sizeof(MBConv6_##N##_depthwise_conv_BN_variance) / sizeof(float), \
    &MBConv6_##N##_DW_BN_WEIGHTS,    MBConv6_##N##_depthwise_conv_BN_weights,  sizeof(MBConv6_##N##_depthwise_conv_BN_weights) / sizeof(float),  \
    &MBConv6_##N##_DW_BN_BIAS,       MBConv6_##N##_depthwise_conv_BN_bias,     sizeof(MBConv6_##N##_depthwise_conv_BN_bias) / sizeof(float),     \
    /* projection stage */                                                                                                                        \
    &MBConv6_##N##_PRJ_BN_MEAN,      MBConv6_##N##_project_conv_BN_mean,       sizeof(MBConv6_##N##_project_conv_BN_mean) / sizeof(float),       \
    &MBConv6_##N##_PRJ_BN_VARIANCE,  MBConv6_##N##_project_conv_BN_variance,   sizeof(MBConv6_##N##_project_conv_BN_variance) / sizeof(float),   \
    &MBConv6_##N##_PRJ_BN_WEIGHTS,   MBConv6_##N##_project_conv_BN_weights,    sizeof(MBConv6_##N##_project_conv_BN_weights) / sizeof(float),    \
    &MBConv6_##N##_PRJ_BN_BIAS,      MBConv6_##N##_project_conv_BN_bias,       sizeof(MBConv6_##N##_project_conv_BN_bias) / sizeof(float))

UPLOAD_MBCONV6_BN(1);
UPLOAD_MBCONV6_BN(2);
UPLOAD_MBCONV6_BN(3);
UPLOAD_MBCONV6_BN(4);
UPLOAD_MBCONV6_BN(5);
UPLOAD_MBCONV6_BN(6);
UPLOAD_MBCONV6_BN(7);
UPLOAD_MBCONV6_BN(8);
UPLOAD_MBCONV6_BN(9);

/* The helper is only meaningful for this run of calls. */
#undef UPLOAD_MBCONV6_BN

DEFINE_FILTERS_FOR_MBCONV_BN(	
  &MBConv6_10_EXPD_BN_MEAN,     MBConv6_10_expansion_conv_BN_mean,    sizeof(MBConv6_10_expansion_conv_BN_mean) / sizeof(float),
  &MBConv6_10_EXPD_BN_VARIANCE,	MBConv6_10_expansion_conv_BN_variance,sizeof(MBConv6_10_expansion_conv_BN_variance) / sizeof(float),
  &MBConv6_10_EXPD_BN_WEIGHTS,  MBConv6_10_expansion_conv_BN_weights,	sizeof(MBConv6_10_expansion_conv_BN_weights) / sizeof(float),
  &MBConv6_10_EXPD_BN_BIAS,			MBConv6_10_expansion_conv_BN_bias,		sizeof(MBConv6_10_expansion_conv_BN_bias) / sizeof(float),
  &MBConv6_10_DW_BN_MEAN,       MBConv6_10_depthwise_conv_BN_mean,		sizeof(MBConv6_10_depthwise_conv_BN_mean) / sizeof(float),
  &MBConv6_10_DW_BN_VARIANCE,		MBConv6_10_depthwise_conv_BN_variance,sizeof(MBConv6_10_depthwise_conv_BN_variance) / sizeof(float),	
  &MBConv6_10_DW_BN_WEIGHTS,    MBConv6_10_depthwise_conv_BN_weights,	sizeof(MBConv6_10_depthwise_conv_BN_weights) / sizeof(float),
  &MBConv6_10_DW_BN_BIAS,				MBConv6_10_depthwise_conv_BN_bias,		sizeof(MBConv6_10_depthwise_conv_BN_bias) / sizeof(float),
  &MBConv6_10_PRJ_BN_MEAN,      MBConv6_10_project_conv_BN_mean,		  sizeof(MBConv6_10_project_conv_BN_mean) / sizeof(float),
  &MBConv6_10_PRJ_BN_VARIANCE,	MBConv6_10_project_conv_BN_variance,	sizeof(MBConv6_10_project_conv_BN_variance) / sizeof(float),
  &MBConv6_10_PRJ_BN_WEIGHTS,   MBConv6_10_project_conv_BN_weights,		sizeof(MBConv6_10_project_conv_BN_weights) / sizeof(float),
  &MBConv6_10_PRJ_BN_BIAS,			MBConv6_10_project_conv_BN_bias, 		  sizeof(MBConv6_10_project_conv_BN_bias) / sizeof(float));

DEFINE_FILTERS_FOR_MBCONV_BN(	
  &MBConv6_11_EXPD_BN_MEAN,     MBConv6_11_expansion_conv_BN_mean,		sizeof(MBConv6_11_expansion_conv_BN_mean) / sizeof(float),
  &MBConv6_11_EXPD_BN_VARIANCE,	MBConv6_11_expansion_conv_BN_variance,sizeof(MBConv6_11_expansion_conv_BN_variance) / sizeof(float),
  &MBConv6_11_EXPD_BN_WEIGHTS,  MBConv6_11_expansion_conv_BN_weights,	sizeof(MBConv6_11_expansion_conv_BN_weights) / sizeof(float),
  &MBConv6_11_EXPD_BN_BIAS,			MBConv6_11_expansion_conv_BN_bias,		sizeof(MBConv6_11_expansion_conv_BN_bias) / sizeof(float),
  &MBConv6_11_DW_BN_MEAN,       MBConv6_11_depthwise_conv_BN_mean,		sizeof(MBConv6_11_depthwise_conv_BN_mean) / sizeof(float),
  &MBConv6_11_DW_BN_VARIANCE,		MBConv6_11_depthwise_conv_BN_variance,sizeof(MBConv6_11_depthwise_conv_BN_variance) / sizeof(float),	
  &MBConv6_11_DW_BN_WEIGHTS,    MBConv6_11_depthwise_conv_BN_weights,	sizeof(MBConv6_11_depthwise_conv_BN_weights) / sizeof(float),
  &MBConv6_11_DW_BN_BIAS,				MBConv6_11_depthwise_conv_BN_bias,		sizeof(MBConv6_11_depthwise_conv_BN_bias) / sizeof(float),
  &MBConv6_11_PRJ_BN_MEAN,      MBConv6_11_project_conv_BN_mean,		  sizeof(MBConv6_11_project_conv_BN_mean) / sizeof(float),
  &MBConv6_11_PRJ_BN_VARIANCE,	MBConv6_11_project_conv_BN_variance,	sizeof(MBConv6_11_project_conv_BN_variance) / sizeof(float),
  &MBConv6_11_PRJ_BN_WEIGHTS,   MBConv6_11_project_conv_BN_weights,		sizeof(MBConv6_11_project_conv_BN_weights) / sizeof(float),
  &MBConv6_11_PRJ_BN_BIAS,			MBConv6_11_project_conv_BN_bias, 		  sizeof(MBConv6_11_project_conv_BN_bias) / sizeof(float));

  DEFINE_FILTERS_FOR_MBCONV_BN(	
  &MBConv6_12_EXPD_BN_MEAN,     MBConv6_12_expansion_conv_BN_mean,		sizeof(MBConv6_12_expansion_conv_BN_mean) / sizeof(float),
  &MBConv6_12_EXPD_BN_VARIANCE,	MBConv6_12_expansion_conv_BN_variance,sizeof(MBConv6_12_expansion_conv_BN_variance) / sizeof(float),
  &MBConv6_12_EXPD_BN_WEIGHTS,  MBConv6_12_expansion_conv_BN_weights,	sizeof(MBConv6_12_expansion_conv_BN_weights) / sizeof(float),
  &MBConv6_12_EXPD_BN_BIAS,			MBConv6_12_expansion_conv_BN_bias,		sizeof(MBConv6_12_expansion_conv_BN_bias) / sizeof(float),
  &MBConv6_12_DW_BN_MEAN,       MBConv6_12_depthwise_conv_BN_mean,		sizeof(MBConv6_12_depthwise_conv_BN_mean) / sizeof(float),
  &MBConv6_12_DW_BN_VARIANCE,		MBConv6_12_depthwise_conv_BN_variance,sizeof(MBConv6_12_depthwise_conv_BN_variance) / sizeof(float),	
  &MBConv6_12_DW_BN_WEIGHTS,    MBConv6_12_depthwise_conv_BN_weights,	sizeof(MBConv6_12_depthwise_conv_BN_weights) / sizeof(float),
  &MBConv6_12_DW_BN_BIAS,				MBConv6_12_depthwise_conv_BN_bias,		sizeof(MBConv6_12_depthwise_conv_BN_bias) / sizeof(float),
  &MBConv6_12_PRJ_BN_MEAN,      MBConv6_12_project_conv_BN_mean,		  sizeof(MBConv6_12_project_conv_BN_mean) / sizeof(float),
  &MBConv6_12_PRJ_BN_VARIANCE,	MBConv6_12_project_conv_BN_variance,	sizeof(MBConv6_12_project_conv_BN_variance) / sizeof(float),
  &MBConv6_12_PRJ_BN_WEIGHTS,   MBConv6_12_project_conv_BN_weights,		sizeof(MBConv6_12_project_conv_BN_weights) / sizeof(float),
  &MBConv6_12_PRJ_BN_BIAS,			MBConv6_12_project_conv_BN_bias, 		  sizeof(MBConv6_12_project_conv_BN_bias) / sizeof(float));

DEFINE_FILTERS_FOR_MBCONV_BN(	
  &MBConv6_13_EXPD_BN_MEAN,     MBConv6_13_expansion_conv_BN_mean,		sizeof(MBConv6_13_expansion_conv_BN_mean) / sizeof(float),
  &MBConv6_13_EXPD_BN_VARIANCE,	MBConv6_13_expansion_conv_BN_variance,sizeof(MBConv6_13_expansion_conv_BN_variance) / sizeof(float),
  &MBConv6_13_EXPD_BN_WEIGHTS,  MBConv6_13_expansion_conv_BN_weights,	sizeof(MBConv6_13_expansion_conv_BN_weights) / sizeof(float),
  &MBConv6_13_EXPD_BN_BIAS,			MBConv6_13_expansion_conv_BN_bias,		sizeof(MBConv6_13_expansion_conv_BN_bias) / sizeof(float),
  &MBConv6_13_DW_BN_MEAN,       MBConv6_13_depthwise_conv_BN_mean,		sizeof(MBConv6_13_depthwise_conv_BN_mean) / sizeof(float),
  &MBConv6_13_DW_BN_VARIANCE,		MBConv6_13_depthwise_conv_BN_variance,sizeof(MBConv6_13_depthwise_conv_BN_variance) / sizeof(float),	
  &MBConv6_13_DW_BN_WEIGHTS,    MBConv6_13_depthwise_conv_BN_weights,	sizeof(MBConv6_13_depthwise_conv_BN_weights) / sizeof(float),
  &MBConv6_13_DW_BN_BIAS,				MBConv6_13_depthwise_conv_BN_bias,		sizeof(MBConv6_13_depthwise_conv_BN_bias) / sizeof(float),
  &MBConv6_13_PRJ_BN_MEAN,      MBConv6_13_project_conv_BN_mean,		  sizeof(MBConv6_13_project_conv_BN_mean) / sizeof(float),
  &MBConv6_13_PRJ_BN_VARIANCE,	MBConv6_13_project_conv_BN_variance,	sizeof(MBConv6_13_project_conv_BN_variance) / sizeof(float),
  &MBConv6_13_PRJ_BN_WEIGHTS,   MBConv6_13_project_conv_BN_weights,		sizeof(MBConv6_13_project_conv_BN_weights) / sizeof(float),
  &MBConv6_13_PRJ_BN_BIAS,			MBConv6_13_project_conv_BN_bias, 		  sizeof(MBConv6_13_project_conv_BN_bias) / sizeof(float));

DEFINE_FILTERS_FOR_MBCONV_BN(	
  &MBConv6_14_EXPD_BN_MEAN,     MBConv6_14_expansion_conv_BN_mean,		sizeof(MBConv6_14_expansion_conv_BN_mean) / sizeof(float),
  &MBConv6_14_EXPD_BN_VARIANCE,	MBConv6_14_expansion_conv_BN_variance,sizeof(MBConv6_14_expansion_conv_BN_variance) / sizeof(float),
  &MBConv6_14_EXPD_BN_WEIGHTS,  MBConv6_14_expansion_conv_BN_weights,	sizeof(MBConv6_14_expansion_conv_BN_weights) / sizeof(float),
  &MBConv6_14_EXPD_BN_BIAS,			MBConv6_14_expansion_conv_BN_bias,		sizeof(MBConv6_14_expansion_conv_BN_bias) / sizeof(float),
  &MBConv6_14_DW_BN_MEAN,       MBConv6_14_depthwise_conv_BN_mean,		sizeof(MBConv6_14_depthwise_conv_BN_mean) / sizeof(float),
  &MBConv6_14_DW_BN_VARIANCE,		MBConv6_14_depthwise_conv_BN_variance,sizeof(MBConv6_14_depthwise_conv_BN_variance) / sizeof(float),	
  &MBConv6_14_DW_BN_WEIGHTS,    MBConv6_14_depthwise_conv_BN_weights,	sizeof(MBConv6_14_depthwise_conv_BN_weights) / sizeof(float),
  &MBConv6_14_DW_BN_BIAS,				MBConv6_14_depthwise_conv_BN_bias,		sizeof(MBConv6_14_depthwise_conv_BN_bias) / sizeof(float),
  &MBConv6_14_PRJ_BN_MEAN,      MBConv6_14_project_conv_BN_mean,		  sizeof(MBConv6_14_project_conv_BN_mean) / sizeof(float),
  &MBConv6_14_PRJ_BN_VARIANCE,	MBConv6_14_project_conv_BN_variance,	sizeof(MBConv6_14_project_conv_BN_variance) / sizeof(float),
  &MBConv6_14_PRJ_BN_WEIGHTS,   MBConv6_14_project_conv_BN_weights,		sizeof(MBConv6_14_project_conv_BN_weights) / sizeof(float),
  &MBConv6_14_PRJ_BN_BIAS,			MBConv6_14_project_conv_BN_bias, 		  sizeof(MBConv6_14_project_conv_BN_bias) / sizeof(float));

DEFINE_FILTERS_FOR_MBCONV_BN(	
  &MBConv6_15_EXPD_BN_MEAN,     MBConv6_15_expansion_conv_BN_mean,		sizeof(MBConv6_15_expansion_conv_BN_mean) / sizeof(float),
  &MBConv6_15_EXPD_BN_VARIANCE,	MBConv6_15_expansion_conv_BN_variance,sizeof(MBConv6_15_expansion_conv_BN_variance) / sizeof(float),
  &MBConv6_15_EXPD_BN_WEIGHTS,  MBConv6_15_expansion_conv_BN_weights,	sizeof(MBConv6_15_expansion_conv_BN_weights) / sizeof(float),
  &MBConv6_15_EXPD_BN_BIAS,			MBConv6_15_expansion_conv_BN_bias,		sizeof(MBConv6_15_expansion_conv_BN_bias) / sizeof(float),
  &MBConv6_15_DW_BN_MEAN,       MBConv6_15_depthwise_conv_BN_mean,		sizeof(MBConv6_15_depthwise_conv_BN_mean) / sizeof(float),
  &MBConv6_15_DW_BN_VARIANCE,		MBConv6_15_depthwise_conv_BN_variance,sizeof(MBConv6_15_depthwise_conv_BN_variance) / sizeof(float),	
  &MBConv6_15_DW_BN_WEIGHTS,    MBConv6_15_depthwise_conv_BN_weights,	sizeof(MBConv6_15_depthwise_conv_BN_weights) / sizeof(float),
  &MBConv6_15_DW_BN_BIAS,				MBConv6_15_depthwise_conv_BN_bias,		sizeof(MBConv6_15_depthwise_conv_BN_bias) / sizeof(float),
  &MBConv6_15_PRJ_BN_MEAN,      MBConv6_15_project_conv_BN_mean,		  sizeof(MBConv6_15_project_conv_BN_mean) / sizeof(float),
  &MBConv6_15_PRJ_BN_VARIANCE,	MBConv6_15_project_conv_BN_variance,	sizeof(MBConv6_15_project_conv_BN_variance) / sizeof(float),
  &MBConv6_15_PRJ_BN_WEIGHTS,   MBConv6_15_project_conv_BN_weights,		sizeof(MBConv6_15_project_conv_BN_weights) / sizeof(float),
  &MBConv6_15_PRJ_BN_BIAS,			MBConv6_15_project_conv_BN_bias, 		  sizeof(MBConv6_15_project_conv_BN_bias) / sizeof(float));


set_allocate_copy_array_Device(&D_STEM_BN_MEAN, Stem_BN_mean,
                sizeof(Stem_BN_mean)/sizeof(float), 1, 1,
                "STEM MEAN"); 
set_allocate_copy_array_Device(&D_STEM_BN_VARIANCE, Stem_BN_variance,
                sizeof(Stem_BN_variance)/sizeof(float), 1, 1,
                "STEAM VARIANCE"); 
set_allocate_copy_array_Device(&D_STEM_BN_WEIGHTS, Stem_BN_weights,
                sizeof(Stem_BN_weights)/sizeof(float), 1, 1,
                "STEM WEIGHTS"); 
set_allocate_copy_array_Device(&D_STEM_BN_BIAS, Stem_BN_bias,
                sizeof(Stem_BN_bias)/sizeof(float), 1, 1,
                "STEM BIAS"); 
                
set_allocate_copy_array_Device(&D_HEAD_BN_MEAN, Head_BN_mean,
                sizeof(Head_BN_mean)/sizeof(float), 1, 1,
                "HEAD MEAN"); 
set_allocate_copy_array_Device(&D_HEAD_BN_VARIANCE, Head_BN_variance,
                sizeof(Head_BN_variance)/sizeof(float), 1, 1,
                "HEAD VARIANCE"); 
set_allocate_copy_array_Device(&D_HEAD_BN_WEIGHTS, Head_BN_weights,
                sizeof(Head_BN_weights)/sizeof(float), 1, 1,
                "HEAD WEIGHTS"); 
set_allocate_copy_array_Device(&D_HEAD_BN_BIAS, Head_BN_bias,
                sizeof(Head_BN_bias)/sizeof(float), 1, 1,
                "HEAD BIAS"); 
start();
  // 3. Move through all layers starting from stem layer till head layer
  Matrix ConvOutStem;
  STEM_LAYER(&DInput_Mat, &F_STEM,
              INPUT_IMAGE_HEIGHT, INPUT_IMAGE_WIDTH, INPUT_IMAGE_DEPTH,
              STEM_FILTER_HEIGHT, STEM_FILTER_WIDTH, STEM_FILTER_DEPTH, 
              STEM_FILTER_DENSITY,STEM_PADDING,      STEM_STRIDE,
              &ConvOutStem);


  Matrix ConvOut1_0;
  MBCONV1_0_flag = 1;  

  MBConv_Layer(&ConvOutStem, &ConvOut1_0,
                &D_MBConv_1_0_EXPD_WEIGHTS, &D_MBConv_1_0_DW_WEIGHTS,
                &D_MBConv_1_0_SQZ_1_WEIGHTS,&D_MBConv_1_0_SQZ_2_WEIGHTS,
                &D_MBConv_1_0_PRJ_WEIGHTS,
                MBCONV_1_0_EXPD_F_DENSITY,  MBCONV_1_0_DW_F_DENSITY, 
                MBCONV_1_0_SQZ_1_F_DENSITY, MBCONV_1_0_SQZ_2_F_DENSITY, 
                MBCONV_1_0_PRJ_F_DENSITY,
                ConvOutStem.depth,          MBCONV_1_0_PRJ_F_DENSITY, MBCONV_1_0_DW_F_HEIGHT,
                MBCONV_1_0_STRIDE,          MBCONV_1_0_PADDING, MBCONV_1_0_SKIP,
                &MBConv1_0_SQZ_1_bias, 	    &MBConv1_0_SQZ_2_bias,
                NULL,                       NULL,
                NULL,                       NULL,
                &MBConv1_0_DW_BN_MEAN,      &MBConv1_0_DW_BN_VARIANCE,
                &MBConv1_0_DW_BN_WEIGHTS,   &MBConv1_0_DW_BN_BIAS,
                &MBConv1_0_PRJ_BN_MEAN,     &MBConv1_0_PRJ_BN_VARIANCE,
                &MBConv1_0_PRJ_BN_WEIGHTS,  &MBConv1_0_PRJ_BN_BIAS);
  MBCONV1_0_flag = 0;
  

  Matrix ConvOut;
  MBConv_Layer(&ConvOut1_0, &ConvOut,
                &D_MBConv_6_1_EXPD_WEIGHTS, &D_MBConv_6_1_DW_WEIGHTS,
                &D_MBConv_6_1_SQZ_1_WEIGHTS,&D_MBConv_6_1_SQZ_2_WEIGHTS,
                &D_MBConv_6_1_PRJ_WEIGHTS,
                MBCONV_6_1_EXPD_F_DENSITY,  MBCONV_6_1_DW_F_DENSITY, 
                MBCONV_6_1_SQZ_1_F_DENSITY, MBCONV_6_1_SQZ_2_F_DENSITY, 
                MBCONV_6_1_PRJ_F_DENSITY,
                ConvOut1_0.depth,           MBCONV_6_1_PRJ_F_DENSITY, MBCONV_6_1_DW_F_HEIGHT,
                MBCONV_6_1_STRIDE,          MBCONV_6_1_PADDING, MBCONV_6_1_SKIP,
                &MBConv6_1_SQZ_1_bias, 	    &MBConv6_1_SQZ_2_bias,
                &MBConv6_1_EXPD_BN_MEAN,    &MBConv6_1_EXPD_BN_VARIANCE,
                &MBConv6_1_EXPD_BN_WEIGHTS, &MBConv6_1_EXPD_BN_BIAS,
                &MBConv6_1_DW_BN_MEAN,      &MBConv6_1_DW_BN_VARIANCE,
                &MBConv6_1_DW_BN_WEIGHTS,   &MBConv6_1_DW_BN_BIAS,
                &MBConv6_1_PRJ_BN_MEAN,     &MBConv6_1_PRJ_BN_VARIANCE,
                &MBConv6_1_PRJ_BN_WEIGHTS,  &MBConv6_1_PRJ_BN_BIAS);


  Matrix ConvOut2;
  MBConv_Layer(&ConvOut, &ConvOut2,
                &D_MBConv_6_2_EXPD_WEIGHTS, &D_MBConv_6_2_DW_WEIGHTS,
                &D_MBConv_6_2_SQZ_1_WEIGHTS,&D_MBConv_6_2_SQZ_2_WEIGHTS,
                &D_MBConv_6_2_PRJ_WEIGHTS,
                MBCONV_6_2_EXPD_F_DENSITY,  MBCONV_6_2_DW_F_DENSITY, 
                MBCONV_6_2_SQZ_1_F_DENSITY, MBCONV_6_2_SQZ_2_F_DENSITY, 
                MBCONV_6_2_PRJ_F_DENSITY,
                ConvOut.depth,              MBCONV_6_2_PRJ_F_DENSITY, MBCONV_6_2_DW_F_HEIGHT,
                MBCONV_6_2_STRIDE,          MBCONV_6_2_PADDING, MBCONV_6_2_SKIP,
                &MBConv6_2_SQZ_1_bias, 	    &MBConv6_2_SQZ_2_bias,
                &MBConv6_2_EXPD_BN_MEAN,    &MBConv6_2_EXPD_BN_VARIANCE,
                &MBConv6_2_EXPD_BN_WEIGHTS, &MBConv6_2_EXPD_BN_BIAS,
                &MBConv6_2_DW_BN_MEAN,      &MBConv6_2_DW_BN_VARIANCE,
                &MBConv6_2_DW_BN_WEIGHTS,   &MBConv6_2_DW_BN_BIAS,
                &MBConv6_2_PRJ_BN_MEAN,     &MBConv6_2_PRJ_BN_VARIANCE,
                &MBConv6_2_PRJ_BN_WEIGHTS,  &MBConv6_2_PRJ_BN_BIAS); 


  Matrix ConvOut3;
	MBConv_Layer(&ConvOut2, &ConvOut3,
                &D_MBConv_6_3_EXPD_WEIGHTS, &D_MBConv_6_3_DW_WEIGHTS,
                &D_MBConv_6_3_SQZ_1_WEIGHTS,&D_MBConv_6_3_SQZ_2_WEIGHTS,
                &D_MBConv_6_3_PRJ_WEIGHTS,
                MBCONV_6_3_EXPD_F_DENSITY,  MBCONV_6_3_DW_F_DENSITY, 
                MBCONV_6_3_SQZ_1_F_DENSITY, MBCONV_6_3_SQZ_2_F_DENSITY, 
                MBCONV_6_3_PRJ_F_DENSITY,
                ConvOut2.depth,             MBCONV_6_3_PRJ_F_DENSITY, MBCONV_6_3_DW_F_HEIGHT,
                MBCONV_6_3_STRIDE,          MBCONV_6_3_PADDING, MBCONV_6_3_SKIP,
                &MBConv6_3_SQZ_1_bias,  	  &MBConv6_3_SQZ_2_bias,
                &MBConv6_3_EXPD_BN_MEAN,    &MBConv6_3_EXPD_BN_VARIANCE,
                &MBConv6_3_EXPD_BN_WEIGHTS, &MBConv6_3_EXPD_BN_BIAS,
                &MBConv6_3_DW_BN_MEAN,      &MBConv6_3_DW_BN_VARIANCE,
                &MBConv6_3_DW_BN_WEIGHTS,   &MBConv6_3_DW_BN_BIAS,
                &MBConv6_3_PRJ_BN_MEAN,     &MBConv6_3_PRJ_BN_VARIANCE,
                &MBConv6_3_PRJ_BN_WEIGHTS,  &MBConv6_3_PRJ_BN_BIAS);  
 

  // MBConv6_4 layer implementation

  Matrix ConvOut4;
  MBConv_Layer(&ConvOut3, &ConvOut4,
                &D_MBConv_6_4_EXPD_WEIGHTS, &D_MBConv_6_4_DW_WEIGHTS,
                &D_MBConv_6_4_SQZ_1_WEIGHTS,&D_MBConv_6_4_SQZ_2_WEIGHTS,
                &D_MBConv_6_4_PRJ_WEIGHTS,
                MBCONV_6_4_EXPD_F_DENSITY,  MBCONV_6_4_DW_F_DENSITY, 
                MBCONV_6_4_SQZ_1_F_DENSITY, MBCONV_6_4_SQZ_2_F_DENSITY, 
                MBCONV_6_4_PRJ_F_DENSITY,
                ConvOut3.depth,             MBCONV_6_4_PRJ_F_DENSITY, MBCONV_6_4_DW_F_HEIGHT,
                MBCONV_6_4_STRIDE,          MBCONV_6_4_PADDING, MBCONV_6_4_SKIP,
                &MBConv6_4_SQZ_1_bias,  	  &MBConv6_4_SQZ_2_bias,
                &MBConv6_4_EXPD_BN_MEAN,    &MBConv6_4_EXPD_BN_VARIANCE,
                &MBConv6_4_EXPD_BN_WEIGHTS, &MBConv6_4_EXPD_BN_BIAS,
                &MBConv6_4_DW_BN_MEAN,      &MBConv6_4_DW_BN_VARIANCE,
                &MBConv6_4_DW_BN_WEIGHTS,   &MBConv6_4_DW_BN_BIAS,
                &MBConv6_4_PRJ_BN_MEAN,     &MBConv6_4_PRJ_BN_VARIANCE,
                &MBConv6_4_PRJ_BN_WEIGHTS,  &MBConv6_4_PRJ_BN_BIAS);   
  

  Matrix ConvOut5;
  MBConv_Layer(&ConvOut4, &ConvOut5,
                &D_MBConv_6_5_EXPD_WEIGHTS, &D_MBConv_6_5_DW_WEIGHTS,
                &D_MBConv_6_5_SQZ_1_WEIGHTS,&D_MBConv_6_5_SQZ_2_WEIGHTS,
                &D_MBConv_6_5_PRJ_WEIGHTS,
                MBCONV_6_5_EXPD_F_DENSITY,  MBCONV_6_5_DW_F_DENSITY, 
                MBCONV_6_5_SQZ_1_F_DENSITY, MBCONV_6_5_SQZ_2_F_DENSITY, 
                MBCONV_6_5_PRJ_F_DENSITY,
                ConvOut4.depth,             MBCONV_6_5_PRJ_F_DENSITY, MBCONV_6_5_DW_F_HEIGHT,
                MBCONV_6_5_STRIDE,          MBCONV_6_5_PADDING, MBCONV_6_5_SKIP,
                &MBConv6_5_SQZ_1_bias,  	  &MBConv6_5_SQZ_2_bias,
                &MBConv6_5_EXPD_BN_MEAN,    &MBConv6_5_EXPD_BN_VARIANCE,
                &MBConv6_5_EXPD_BN_WEIGHTS, &MBConv6_5_EXPD_BN_BIAS,
                &MBConv6_5_DW_BN_MEAN,      &MBConv6_5_DW_BN_VARIANCE,
                &MBConv6_5_DW_BN_WEIGHTS,   &MBConv6_5_DW_BN_BIAS,
                &MBConv6_5_PRJ_BN_MEAN,     &MBConv6_5_PRJ_BN_VARIANCE,
                &MBConv6_5_PRJ_BN_WEIGHTS,  &MBConv6_5_PRJ_BN_BIAS); 
            


  // MBConv6_6 layer implementation


  Matrix ConvOut6;
  MBConv_Layer(&ConvOut5, &ConvOut6,
                &D_MBConv_6_6_EXPD_WEIGHTS, &D_MBConv_6_6_DW_WEIGHTS,
                &D_MBConv_6_6_SQZ_1_WEIGHTS,&D_MBConv_6_6_SQZ_2_WEIGHTS,
                &D_MBConv_6_6_PRJ_WEIGHTS,
                MBCONV_6_6_EXPD_F_DENSITY,  MBCONV_6_6_DW_F_DENSITY, 
                MBCONV_6_6_SQZ_1_F_DENSITY, MBCONV_6_6_SQZ_2_F_DENSITY, 
                MBCONV_6_6_PRJ_F_DENSITY,
                ConvOut5.depth,             MBCONV_6_6_PRJ_F_DENSITY, MBCONV_6_6_DW_F_HEIGHT,
                MBCONV_6_6_STRIDE,          MBCONV_6_6_PADDING, MBCONV_6_6_SKIP,
                &MBConv6_6_SQZ_1_bias, 	    &MBConv6_6_SQZ_2_bias,
                &MBConv6_6_EXPD_BN_MEAN,    &MBConv6_6_EXPD_BN_VARIANCE,
                &MBConv6_6_EXPD_BN_WEIGHTS, &MBConv6_6_EXPD_BN_BIAS,
                &MBConv6_6_DW_BN_MEAN,      &MBConv6_6_DW_BN_VARIANCE,
                &MBConv6_6_DW_BN_WEIGHTS,   &MBConv6_6_DW_BN_BIAS,
                &MBConv6_6_PRJ_BN_MEAN,     &MBConv6_6_PRJ_BN_VARIANCE,
                &MBConv6_6_PRJ_BN_WEIGHTS,  &MBConv6_6_PRJ_BN_BIAS);  
            


  // MBConv6_7 layer implementation


  Matrix ConvOut7;
  MBConv_Layer(&ConvOut6, &ConvOut7,
                &D_MBConv_6_7_EXPD_WEIGHTS, &D_MBConv_6_7_DW_WEIGHTS,
                &D_MBConv_6_7_SQZ_1_WEIGHTS,&D_MBConv_6_7_SQZ_2_WEIGHTS,
                &D_MBConv_6_7_PRJ_WEIGHTS,
                MBCONV_6_7_EXPD_F_DENSITY,  MBCONV_6_7_DW_F_DENSITY, 
                MBCONV_6_7_SQZ_1_F_DENSITY, MBCONV_6_7_SQZ_2_F_DENSITY, 
                MBCONV_6_7_PRJ_F_DENSITY,
                ConvOut6.depth,             MBCONV_6_7_PRJ_F_DENSITY, MBCONV_6_7_DW_F_HEIGHT,                   
                MBCONV_6_7_STRIDE,          MBCONV_6_7_PADDING, MBCONV_6_7_SKIP,
                &MBConv6_7_SQZ_1_bias,  	  &MBConv6_7_SQZ_2_bias,
                &MBConv6_7_EXPD_BN_MEAN,    &MBConv6_7_EXPD_BN_VARIANCE,
                &MBConv6_7_EXPD_BN_WEIGHTS, &MBConv6_7_EXPD_BN_BIAS,
                &MBConv6_7_DW_BN_MEAN,      &MBConv6_7_DW_BN_VARIANCE,
                &MBConv6_7_DW_BN_WEIGHTS,   &MBConv6_7_DW_BN_BIAS,
                &MBConv6_7_PRJ_BN_MEAN,     &MBConv6_7_PRJ_BN_VARIANCE,
                &MBConv6_7_PRJ_BN_WEIGHTS,  &MBConv6_7_PRJ_BN_BIAS);  
          


  // MBConv6_8 layer implementation
  Matrix ConvOut8;
  MBConv_Layer(&ConvOut7, &ConvOut8,
                &D_MBConv_6_8_EXPD_WEIGHTS, &D_MBConv_6_8_DW_WEIGHTS,
                &D_MBConv_6_8_SQZ_1_WEIGHTS,&D_MBConv_6_8_SQZ_2_WEIGHTS,
                &D_MBConv_6_8_PRJ_WEIGHTS,
                MBCONV_6_8_EXPD_F_DENSITY,  MBCONV_6_8_DW_F_DENSITY, 
                MBCONV_6_8_SQZ_1_F_DENSITY, MBCONV_6_8_SQZ_2_F_DENSITY, 
                MBCONV_6_8_PRJ_F_DENSITY,
                ConvOut7.depth,             MBCONV_6_8_PRJ_F_DENSITY, MBCONV_6_8_DW_F_HEIGHT,    
                MBCONV_6_8_STRIDE,          MBCONV_6_8_PADDING, MBCONV_6_8_SKIP,
                &MBConv6_8_SQZ_1_bias,      &MBConv6_8_SQZ_2_bias,
                &MBConv6_8_EXPD_BN_MEAN,    &MBConv6_8_EXPD_BN_VARIANCE,
                &MBConv6_8_EXPD_BN_WEIGHTS, &MBConv6_8_EXPD_BN_BIAS,
                &MBConv6_8_DW_BN_MEAN,      &MBConv6_8_DW_BN_VARIANCE,
                &MBConv6_8_DW_BN_WEIGHTS,   &MBConv6_8_DW_BN_BIAS,
                &MBConv6_8_PRJ_BN_MEAN,     &MBConv6_8_PRJ_BN_VARIANCE,
                &MBConv6_8_PRJ_BN_WEIGHTS,  &MBConv6_8_PRJ_BN_BIAS); 
        


  // MBConv6_9 layer implementation
  Matrix ConvOut9;
  MBConv_Layer(&ConvOut8, &ConvOut9,
                &D_MBConv_6_9_EXPD_WEIGHTS, &D_MBConv_6_9_DW_WEIGHTS,
                &D_MBConv_6_9_SQZ_1_WEIGHTS,&D_MBConv_6_9_SQZ_2_WEIGHTS,
                &D_MBConv_6_9_PRJ_WEIGHTS,
                MBCONV_6_9_EXPD_F_DENSITY,  MBCONV_6_9_DW_F_DENSITY, 
                MBCONV_6_9_SQZ_1_F_DENSITY, MBCONV_6_9_SQZ_2_F_DENSITY, 
                MBCONV_6_9_PRJ_F_DENSITY,
                ConvOut8.depth,             MBCONV_6_9_PRJ_F_DENSITY, MBCONV_6_9_DW_F_HEIGHT,
                MBCONV_6_9_STRIDE,          MBCONV_6_9_PADDING, MBCONV_6_9_SKIP,
                &MBConv6_9_SQZ_1_bias,  	  &MBConv6_9_SQZ_2_bias,
                &MBConv6_9_EXPD_BN_MEAN,    &MBConv6_9_EXPD_BN_VARIANCE,
                &MBConv6_9_EXPD_BN_WEIGHTS, &MBConv6_9_EXPD_BN_BIAS,
                &MBConv6_9_DW_BN_MEAN,      &MBConv6_9_DW_BN_VARIANCE,
                &MBConv6_9_DW_BN_WEIGHTS,   &MBConv6_9_DW_BN_BIAS,
                &MBConv6_9_PRJ_BN_MEAN,     &MBConv6_9_PRJ_BN_VARIANCE,
                &MBConv6_9_PRJ_BN_WEIGHTS,  &MBConv6_9_PRJ_BN_BIAS);  				  



  // MBConv6_10 layer implementation
  Matrix ConvOut10;
	MBConv_Layer(&ConvOut9, &ConvOut10,
                &D_MBConv_6_10_EXPD_WEIGHTS,  &D_MBConv_6_10_DW_WEIGHTS,
                &D_MBConv_6_10_SQZ_1_WEIGHTS, &D_MBConv_6_10_SQZ_2_WEIGHTS,
                &D_MBConv_6_10_PRJ_WEIGHTS,
                MBCONV_6_10_EXPD_F_DENSITY,   MBCONV_6_10_DW_F_DENSITY, 
                MBCONV_6_10_SQZ_1_F_DENSITY,  MBCONV_6_10_SQZ_2_F_DENSITY, 
                MBCONV_6_10_PRJ_F_DENSITY,
                ConvOut9.depth,               MBCONV_6_10_PRJ_F_DENSITY, MBCONV_6_10_DW_F_HEIGHT,
                MBCONV_6_10_STRIDE,           MBCONV_6_10_PADDING, MBCONV_6_10_SKIP,
                &MBConv6_10_SQZ_1_bias, 	    &MBConv6_10_SQZ_2_bias,
                &MBConv6_10_EXPD_BN_MEAN,     &MBConv6_10_EXPD_BN_VARIANCE,
                &MBConv6_10_EXPD_BN_WEIGHTS,  &MBConv6_10_EXPD_BN_BIAS,
                &MBConv6_10_DW_BN_MEAN,       &MBConv6_10_DW_BN_VARIANCE,
                &MBConv6_10_DW_BN_WEIGHTS,    &MBConv6_10_DW_BN_BIAS,
                &MBConv6_10_PRJ_BN_MEAN,      &MBConv6_10_PRJ_BN_VARIANCE,
                &MBConv6_10_PRJ_BN_WEIGHTS,   &MBConv6_10_PRJ_BN_BIAS);   
  


  // MBConv6_11 layer implementation


  Matrix ConvOut11;
  MBConv_Layer(&ConvOut10, &ConvOut11,
                &D_MBConv_6_11_EXPD_WEIGHTS,  &D_MBConv_6_11_DW_WEIGHTS,
                &D_MBConv_6_11_SQZ_1_WEIGHTS, &D_MBConv_6_11_SQZ_2_WEIGHTS,
                &D_MBConv_6_11_PRJ_WEIGHTS,
                MBCONV_6_11_EXPD_F_DENSITY,   MBCONV_6_11_DW_F_DENSITY, 
                MBCONV_6_11_SQZ_1_F_DENSITY,  MBCONV_6_11_SQZ_2_F_DENSITY, 
                MBCONV_6_11_PRJ_F_DENSITY,  
                ConvOut10.depth,              MBCONV_6_11_PRJ_F_DENSITY, MBCONV_6_11_DW_F_HEIGHT,
                MBCONV_6_11_STRIDE,           MBCONV_6_11_PADDING, MBCONV_6_11_SKIP,
                &MBConv6_11_SQZ_1_bias,       &MBConv6_11_SQZ_2_bias,
                &MBConv6_11_EXPD_BN_MEAN,     &MBConv6_11_EXPD_BN_VARIANCE,
                &MBConv6_11_EXPD_BN_WEIGHTS,  &MBConv6_11_EXPD_BN_BIAS,
                &MBConv6_11_DW_BN_MEAN,       &MBConv6_11_DW_BN_VARIANCE,
                &MBConv6_11_DW_BN_WEIGHTS,    &MBConv6_11_DW_BN_BIAS,
                &MBConv6_11_PRJ_BN_MEAN,      &MBConv6_11_PRJ_BN_VARIANCE,
                &MBConv6_11_PRJ_BN_WEIGHTS,   &MBConv6_11_PRJ_BN_BIAS);  
  


  // MBConv6_12 layer implementation


  Matrix ConvOut12;
  MBConv_Layer(&ConvOut11, &ConvOut12,
                &D_MBConv_6_12_EXPD_WEIGHTS,  &D_MBConv_6_12_DW_WEIGHTS,
                &D_MBConv_6_12_SQZ_1_WEIGHTS, &D_MBConv_6_12_SQZ_2_WEIGHTS,
                &D_MBConv_6_12_PRJ_WEIGHTS,
                MBCONV_6_12_EXPD_F_DENSITY,   MBCONV_6_12_DW_F_DENSITY, 
                MBCONV_6_12_SQZ_1_F_DENSITY,  MBCONV_6_12_SQZ_2_F_DENSITY, 
                MBCONV_6_12_PRJ_F_DENSITY,
                ConvOut11.depth,              MBCONV_6_12_PRJ_F_DENSITY, MBCONV_6_12_DW_F_HEIGHT,
                MBCONV_6_12_STRIDE,           MBCONV_6_12_PADDING, MBCONV_6_12_SKIP,
                &MBConv6_12_SQZ_1_bias,       &MBConv6_12_SQZ_2_bias,
                &MBConv6_12_EXPD_BN_MEAN,     &MBConv6_12_EXPD_BN_VARIANCE,
                &MBConv6_12_EXPD_BN_WEIGHTS,  &MBConv6_12_EXPD_BN_BIAS,
                &MBConv6_12_DW_BN_MEAN,       &MBConv6_12_DW_BN_VARIANCE,
                &MBConv6_12_DW_BN_WEIGHTS,    &MBConv6_12_DW_BN_BIAS,
                &MBConv6_12_PRJ_BN_MEAN,      &MBConv6_12_PRJ_BN_VARIANCE,
                &MBConv6_12_PRJ_BN_WEIGHTS,   &MBConv6_12_PRJ_BN_BIAS);   
  


  // MBConv6_13 layer implementation

  Matrix ConvOut13;
  MBConv_Layer(&ConvOut12, &ConvOut13,
                &D_MBConv_6_13_EXPD_WEIGHTS,  &D_MBConv_6_13_DW_WEIGHTS,
                &D_MBConv_6_13_SQZ_1_WEIGHTS, &D_MBConv_6_13_SQZ_2_WEIGHTS,
                &D_MBConv_6_13_PRJ_WEIGHTS,
                MBCONV_6_13_EXPD_F_DENSITY,   MBCONV_6_13_DW_F_DENSITY, 
                MBCONV_6_13_SQZ_1_F_DENSITY,  MBCONV_6_13_SQZ_2_F_DENSITY, 
                MBCONV_6_13_PRJ_F_DENSITY,
                ConvOut12.depth,              MBCONV_6_13_PRJ_F_DENSITY, MBCONV_6_13_DW_F_HEIGHT,
                MBCONV_6_13_STRIDE,           MBCONV_6_13_PADDING, MBCONV_6_13_SKIP,
                &MBConv6_13_SQZ_1_bias,       &MBConv6_13_SQZ_2_bias,
                &MBConv6_13_EXPD_BN_MEAN,     &MBConv6_13_EXPD_BN_VARIANCE,
                &MBConv6_13_EXPD_BN_WEIGHTS,  &MBConv6_13_EXPD_BN_BIAS,
                &MBConv6_13_DW_BN_MEAN,       &MBConv6_13_DW_BN_VARIANCE,
                &MBConv6_13_DW_BN_WEIGHTS,    &MBConv6_13_DW_BN_BIAS,
                &MBConv6_13_PRJ_BN_MEAN,      &MBConv6_13_PRJ_BN_VARIANCE,
                &MBConv6_13_PRJ_BN_WEIGHTS,   &MBConv6_13_PRJ_BN_BIAS);


  Matrix ConvOut14;
  MBConv_Layer(&ConvOut13, &ConvOut14,
                &D_MBConv_6_14_EXPD_WEIGHTS,  &D_MBConv_6_14_DW_WEIGHTS,
                &D_MBConv_6_14_SQZ_1_WEIGHTS, &D_MBConv_6_14_SQZ_2_WEIGHTS,
                &D_MBConv_6_14_PRJ_WEIGHTS,
                MBCONV_6_14_EXPD_F_DENSITY,   MBCONV_6_14_DW_F_DENSITY, 
                MBCONV_6_14_SQZ_1_F_DENSITY,  MBCONV_6_14_SQZ_2_F_DENSITY, 
                MBCONV_6_14_PRJ_F_DENSITY,
                ConvOut13.depth,              MBCONV_6_14_PRJ_F_DENSITY, MBCONV_6_14_DW_F_HEIGHT,
                MBCONV_6_14_STRIDE,           MBCONV_6_14_PADDING, MBCONV_6_14_SKIP,
                &MBConv6_14_SQZ_1_bias, 	    &MBConv6_14_SQZ_2_bias,
                &MBConv6_14_EXPD_BN_MEAN,     &MBConv6_14_EXPD_BN_VARIANCE,
                &MBConv6_14_EXPD_BN_WEIGHTS,  &MBConv6_14_EXPD_BN_BIAS,
                &MBConv6_14_DW_BN_MEAN,       &MBConv6_14_DW_BN_VARIANCE,
                &MBConv6_14_DW_BN_WEIGHTS,    &MBConv6_14_DW_BN_BIAS,
                &MBConv6_14_PRJ_BN_MEAN,      &MBConv6_14_PRJ_BN_VARIANCE,
                &MBConv6_14_PRJ_BN_WEIGHTS,   &MBConv6_14_PRJ_BN_BIAS);  


  Matrix ConvOut15;
  MBConv_Layer(&ConvOut14, &ConvOut15,
                &D_MBConv_6_15_EXPD_WEIGHTS,  &D_MBConv_6_15_DW_WEIGHTS,
                &D_MBConv_6_15_SQZ_1_WEIGHTS, &D_MBConv_6_15_SQZ_2_WEIGHTS,
                &D_MBConv_6_15_PRJ_WEIGHTS,
                MBCONV_6_15_EXPD_F_DENSITY,   MBCONV_6_15_DW_F_DENSITY, 
                MBCONV_6_15_SQZ_1_F_DENSITY,  MBCONV_6_15_SQZ_2_F_DENSITY, 
                MBCONV_6_15_PRJ_F_DENSITY,
                ConvOut14.depth,              MBCONV_6_15_PRJ_F_DENSITY, MBCONV_6_15_DW_F_HEIGHT,
                MBCONV_6_15_STRIDE,           MBCONV_6_15_PADDING, MBCONV_6_15_SKIP,
                &MBConv6_15_SQZ_1_bias,       &MBConv6_15_SQZ_2_bias,
                &MBConv6_15_EXPD_BN_MEAN,     &MBConv6_15_EXPD_BN_VARIANCE,
                &MBConv6_15_EXPD_BN_WEIGHTS,  &MBConv6_15_EXPD_BN_BIAS,
                &MBConv6_15_DW_BN_MEAN,       &MBConv6_15_DW_BN_VARIANCE,
                &MBConv6_15_DW_BN_WEIGHTS,    &MBConv6_15_DW_BN_BIAS,
                &MBConv6_15_PRJ_BN_MEAN,      &MBConv6_15_PRJ_BN_VARIANCE,
                &MBConv6_15_PRJ_BN_WEIGHTS,   &MBConv6_15_PRJ_BN_BIAS);   

  // Head layer
  Matrix HEAD_OUT;
  HEAD_LAYER(&ConvOut15, &HEAD_CONV_WEIGHTS, &HEAD_FC_WEIGHTS,
              HEAD_CONV_F_HEIGHT, HEAD_CONV_F_WIDTH, HEAD_CONV_F_DEPTH, HEAD_CONV_F_DENSITY,
              0, 1,
              &HEAD_OUT);
  
}



// Final (head) stage of the network.
//
// Steps, in the order they are issued below:
//   1. Size HEAD_OUT from the input dimensions, padding, filter size and stride.
//   2. Conv2d_Layer in Conv2d_1_x_1 mode with NO_ACTIVATION, then
//      BN_ALL_PRE_DEFINED with SWISH_ACTIVATION using the D_HEAD_BN_* globals.
//   3. REDUCTION_SUM over HEAD_OUT into Result_Mean (the pooling step).
//   4. Conv_vidMultiplier of Result_Mean with FC_WEIGHTS into a 1 x 1000 x 1
//      matrix (the fully connected step).
//   5. stop("Model: ", 0) is called, then Head_linear_bias is added on the
//      host and the result is copied back and displayed.
//
// Parameters:
//   INPUT_MATRIX   - input to the head stage; its height/width/depth are read.
//   F_HEAD         - filter matrix handed to Conv2d_Layer.
//   FC_WEIGHTS     - weight matrix handed to Conv_vidMultiplier.
//   filter_height, filter_width - used only for the output-size computation.
//   filter_depth   - accepted but not read anywhere in this function.
//   filter_density - becomes the depth of HEAD_OUT.
//   padding, stride - convolution padding and stride.
//   HEAD_OUT       - set up via Set_DeviceMatrix and then filled by the
//                    convolution / batch-norm calls.
//
// NOTE(review): the 1 x 1000 x 1 result lives in a local matrix and is only
// displayed; it is not handed back through HEAD_OUT. No release call for the
// local matrices is visible here -- confirm whether the helpers own them.
void HEAD_LAYER(Matrix *INPUT_MATRIX, Matrix *F_HEAD, Matrix *FC_WEIGHTS,
                int filter_height, int filter_width, int filter_depth, int filter_density,
                int padding, int stride,
                Matrix *HEAD_OUT)
{
  // Number of entries in the final output vector (and in Head_linear_bias
  // as far as this function indexes it).
  const int num_outputs = 1000;

  // --- 1. Output geometry of the head convolution -------------------------
  int conv_rows     = (INPUT_MATRIX->height + 2 * padding - filter_height) / stride + 1;
  int conv_cols     = (INPUT_MATRIX->width + 2 * padding - filter_width) / stride + 1;
  int conv_channels = filter_density;

  Set_DeviceMatrix(conv_rows, conv_cols, conv_channels, HEAD_OUT,
                   "Output is allocated in device memory");

  // --- 2. Conv2d 1x1 -> batch norm -> Swish --------------------------------
  Conv2d_Layer(INPUT_MATRIX, F_HEAD, HEAD_OUT,
               stride, padding,
               INPUT_MATRIX->depth, conv_channels, filter_density,
               Conv2d_1_x_1, NO_ACTIVATION,
               0, NULL);

  BN_ALL_PRE_DEFINED(HEAD_OUT, SWISH_ACTIVATION,
                     &D_HEAD_BN_MEAN, &D_HEAD_BN_VARIANCE,
                     &D_HEAD_BN_WEIGHTS, &D_HEAD_BN_BIAS);

  // --- 3. Average pooling, expressed as a reduction sum --------------------
  // NOTE(review): MEAN is passed to REDUCTION_SUM without being set up in
  // this function; presumably the helper initialises it -- confirm.
  Matrix MEAN;
  Matrix Result_Mean;

  // Columns of the reduction result: ceil(height * width / (2 * BLOCK_SIZE)),
  // evaluated in double precision before truncating back to int.
  const int reduced_cols =
      (int)ceil((double)HEAD_OUT->height * HEAD_OUT->width / (2 * BLOCK_SIZE));

  Set_DeviceMatrix(HEAD_OUT->depth, reduced_cols, 1,
                   &Result_Mean,
                   "Reesult Mean matrix allocated in device memory");

  REDUCTION_SUM(HEAD_OUT, &MEAN, &Result_Mean);

  // --- 4. Fully connected layer ---------------------------------------------
  Matrix Out1;
  Set_DeviceMatrix(1, num_outputs, 1, &Out1,
                   "Setting Final Model Output matrix in device memory");

  Conv_vidMultiplier(&Out1, FC_WEIGHTS, &Result_Mean,
                     1, num_outputs, 1,
                     Conv2d_1_x_1, 1,
                     NO_ACTIVATION,
                     0, NULL);

  // The timer is stopped here, before the host-side bias addition below.
  stop("Model: ", 0);

  // --- 5. Add the linear bias on the host and push the result back ---------
  Matrix tmp_out_host;
  set_allocate_Host(&tmp_out_host, 1, num_outputs, 1);
  just_copy_DTH(&tmp_out_host, &Out1, "Copying to add bias");

  for (int out_idx = 0; out_idx < num_outputs; ++out_idx)
    tmp_out_host.elements[out_idx] += Head_linear_bias[out_idx];

  just_copy_HTD(&Out1, &tmp_out_host, "Copying to add bias");
  show_me_enhanced_from_devince(&Out1, "Model final output::");
}

// The first layer in EfficientNet (the stem).
// It returns nothing; the result is written to STEM_OUT, whose elements are allocated in device memory.
/*
 * Stem block: one regular convolution followed by batch normalisation with
 * a swish activation.
 *
 *   DInput_Mat      : input image matrix handed to Conv2d_Layer
 *   F_STEM          : stem convolution filter
 *   image_*         : input height / width / depth
 *   filter_*        : filter height / width; filter_depth is not used here;
 *                     filter_density is the number of filters (= output depth)
 *   padding, stride : convolution geometry
 *   STEM_OUT        : receives the result; it is set up here via Set_DeviceMatrix
 */
void STEM_LAYER(Matrix *DInput_Mat, Matrix *F_STEM,
                  int image_height, int image_width, int image_depth,
                  int filter_height, int filter_width, int filter_depth, int filter_density,
                  int padding, int stride,
                  Matrix *STEM_OUT)
{
  // Output extent per axis: (in + 2 * pad - k) / stride + 1, integer division.
  const int padded_rows = image_height + 2 * padding;
  const int padded_cols = image_width + 2 * padding;

  const int rows_out = (padded_rows - filter_height) / stride + 1;
  const int cols_out = (padded_cols - filter_width) / stride + 1;
  const int channels_out = filter_density;

  // The output matrix is prepared here so the caller can feed it to the next layer.
  Set_DeviceMatrix(rows_out, cols_out, channels_out, STEM_OUT,
                   "Output is allocated in device memory");

  // Convolution without bias and without activation.
  Conv2d_Layer(DInput_Mat, F_STEM, STEM_OUT,
               stride, padding,
               image_depth, channels_out, filter_density,
               Regular_Conv, NO_ACTIVATION,
               0, NULL);

  // Batch norm with the pre-loaded stem statistics; swish is requested from the BN call.
  BN_ALL_PRE_DEFINED(STEM_OUT, SWISH_ACTIVATION,
                     &D_STEM_BN_MEAN, &D_STEM_BN_VARIANCE,
                     &D_STEM_BN_WEIGHTS, &D_STEM_BN_BIAS);
}

'File written in /content/src/APP.cu'

In [155]:
%%cuda --name FUNCTIONS.cu 

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cusolverDn.h>
#include <cuda_runtime.h>


#include "/content/MBCONVS_float/functionsV2.h"
#include "/content/MBCONVS_float/KERNELSH.h"

/*
 * Reports a failed CUDA runtime call and terminates the program.
 *
 *   err  : status returned by the CUDA call
 *   file : source file of the call site (normally __FILE__)
 *   line : source line of the call site (normally __LINE__)
 *
 * Does nothing when err == cudaSuccess. `file` is const because __FILE__
 * expands to a string literal, which must not bind to a plain `char *`.
 */
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}
// Wraps a CUDA runtime call so that a failure is reported with its call site.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

// Timing state shared across the file.
// NOTE(review): presumably filled by the start/stop timing helpers defined
// elsewhere; units are not visible from here - confirm.
float time_defined = 0.0f;
float tmp_time = 0.0f;
float total_time_for_layer = 0.0f;

// CUDA events used to bracket timed regions.
cudaEvent_t start_timing;
cudaEvent_t stop_timing;

// Output toggle; starts disabled (0). NOTE(review): consumers are outside this chunk.
int show_out = 0;

// Running counter, starts at 0. NOTE(review): presumably constant-memory bytes used - confirm.
int total_constant_memory = 0;
                
// Device memory for filters
/*
 * Uploads the five filters of one MBConv block to the device.
 *
 * For each filter i: D_fi is the destination matrix, filteri the host array,
 * and (hi, wi, densi) the height / width / depth passed to
 * set_allocate_copy_array_Device.
 *
 * When MBCONV1_0_flag == 1 the first filter is skipped; the original note
 * states that its host pointer is NULL in that configuration.
 */
void DEFINE_FILTERS_FOR_MBCONV(Matrix *D_f1, float *filter1, int h1, int w1, int dens1,
                               Matrix *D_f2, float *filter2, int h2, int w2, int dens2,
                               Matrix *D_f3, float *filter3, int h3, int w3, int dens3,
                               Matrix *D_f4, float *filter4, int h4, int w4, int dens4,
                               Matrix *D_f5, float *filter5, int h5, int w5, int dens5)
{
    // Filter 1 only exists when the block is not MBConv1_0.
    if (MBCONV1_0_flag != 1)
    {
        set_allocate_copy_array_Device(D_f1, filter1, h1, w1, dens1,
                                       "1st filter allocated");
    }

    // The remaining four filters are always present.
    set_allocate_copy_array_Device(D_f2, filter2, h2, w2, dens2,
                                   "2nd filter allocated");

    set_allocate_copy_array_Device(D_f3, filter3, h3, w3, dens3,
                                   "3rd filter allocated");

    set_allocate_copy_array_Device(D_f4, filter4, h4, w4, dens4,
                                   "4th filter allocated");

    set_allocate_copy_array_Device(D_f5, filter5, h5, w5, dens5,
                                   "5th filter allocated");
}

// Releases the device buffer referenced by `mat` and clears the pointer, so
// that releasing the same matrix again ends up in cudaFree(NULL), a no-op,
// instead of a double free. A NULL `mat` is ignored.
static void release_filter_elements(Matrix *mat)
{
  if (mat == NULL)
    return;

  cudaFree(mat -> elements);
  mat -> elements = NULL;
}

/*
 * Free the device filters of one MBConv block.
 *
 * D_f1 .. D_f5 are the matrices filled by DEFINE_FILTERS_FOR_MBCONV. After
 * the call every `elements` pointer is NULL.
 *
 * NOTE(review): DEFINE_FILTERS_FOR_MBCONV skips D_f1 when MBCONV1_0_flag == 1,
 * so D_f1 -> elements must be NULL or valid on entry - confirm at call sites.
 */
void FREE_FILTERS_FOR_MBCONV(Matrix *D_f1, Matrix *D_f2, 
                             Matrix *D_f3, Matrix *D_f4,
                             Matrix *D_f5)
{
  release_filter_elements(D_f1);
  release_filter_elements(D_f2);
  release_filter_elements(D_f3);
  release_filter_elements(D_f4);
  release_filter_elements(D_f5);
}

/*
 * Computes the per-channel mean of a device tensor.
 *
 *   Output_Modified : input tensor (height x width x depth); its elements are
 *                     only read (they are the source of a device-to-device copy).
 *   sum             : unused; kept so existing callers keep compiling.
 *   DMean           : caller-allocated device matrix that receives the result.
 *                     On return its dimensions are set to 1 x depth x 1.
 *
 * The input is copied into a scratch matrix viewed as
 * (depth rows) x (height * width columns). BN_Kernel_Mean_Reduction is then
 * launched repeatedly; after each pass the column count becomes the number of
 * blocks used in x, until a single column is left. CastingDivision finally
 * receives the element count height * width.
 */
void REDUCTION_SUM(Matrix* Output_Modified, Matrix *sum, Matrix *DMean)
{
    (void)sum;

    // Number of blocks per grid dimension.
    int nbx = 0;
    int nby = 0;
    int nbz = 1;

    size_t size;
    cudaError err;

    // Scratch copy of the input: the reduction passes overwrite their source.
    Matrix DInputMat;
    Set_DeviceMatrix(Output_Modified -> depth, Output_Modified -> height * Output_Modified -> width, 1,
                     &DInputMat, "Copyting Input of reduction mean function into Device memory");

    size = DInputMat.height * DInputMat.width * DInputMat.depth * sizeof(float);
    err = cudaMemcpyAsync(DInputMat.elements, Output_Modified -> elements, size, cudaMemcpyDeviceToDevice, 0);
    CheckCudaError("Copying mean matrix elements from ", err);

    if (DInputMat.width == 1)
    {
        // One element per channel: the loop below would not run at all and
        // DMean would stay unwritten, so hand the values over directly.
        err = cudaMemcpyAsync(DMean -> elements, DInputMat.elements, size, cudaMemcpyDeviceToDevice, 0);
        CheckCudaError("Copying mean matrix elements from ", err);
    }

    // Blocks are 1D. Grid x covers the columns of one channel (the divisor
    // 2 * BLOCK_SIZE suggests two elements per thread - confirm in the kernel);
    // grid y addresses the channels.
    nbx = (int)ceil((float)DInputMat.width / (2 * BLOCK_SIZE));
    nby = (int)ceil((float)DInputMat.height);

    if (nbx == 0) nbx = 1;
    if (nby == 0) nby = 1;

    // Repeat until every channel has been folded down to a single value.
    while (DInputMat.width != 1)
    {
        dim3 dim_Grid2(nbx, nby, nbz);
        dim3 dim_Block2(BLOCK_SIZE, 1, 1);

        BN_Kernel_Mean_Reduction <<< dim_Grid2, dim_Block2 >>> (DInputMat.elements,
                                                                DInputMat.height,
                                                                DInputMat.width,
                                                                DInputMat.depth,
                                                                DMean -> elements,
                                                                DMean -> width);

        // Feed the partial results back in as the input of the next pass.
        size = DMean -> height * DMean -> width * DMean -> depth * sizeof(float);
        err = cudaMemcpyAsync(DInputMat.elements, DMean -> elements, size, cudaMemcpyDeviceToDevice, 0);
        CheckCudaError("Copying mean matrix elements from ", err);

        // The scratch matrix now holds one column per block of the last pass.
        DInputMat.width = nbx;
        DInputMat.height = nby;

        // Launch geometry for the next pass.
        nbx = (int)ceil((float)DInputMat.width / (2 * BLOCK_SIZE));
        nby = (int)ceil((float)DInputMat.height);

        if (nbx == 0) nbx = 1;
        if (nby == 0) nby = 1;

        // DMean takes the shape of the next pass' output.
        DMean -> width = nbx;
        DMean -> height = nby;
    }

    // Present the result as 1 x C x 1 for the layers that consume it.
    DMean -> height = 1; DMean -> width = Output_Modified -> depth; DMean -> depth = 1;

    nbx = (int)ceil((float)DMean -> width / 1024);

    dim3 dim_Grid2(nbx, 1, 1);
    dim3 dim_Block2(1024, 1, 1);
    CastingDivision <<<dim_Grid2, dim_Block2>>> (DMean -> elements, DMean -> width, 
                                                 Output_Modified->height * Output_Modified->width);

    // The scratch copy was never released before, leaking one tensor-sized
    // buffer per call.
    cudaFree(DInputMat.elements);
}


/*  Squeeze_and_Excite(&tmp2, &SE_OUT, F3, F4,
                      FD4, FD3, FD4, FD3);
                      */
// Note: input and output channels args represents the 2 Conv layers output channels respectively
/*
 * Squeeze-and-excite block, applied in place to InputIMG.
 *
 *   InputIMG        : device tensor that is rescaled channel by channel
 *   Result          : unused; kept for interface compatibility
 *   Filter1/Filter2 : filters of the two 1x1 convolutions
 *   FilterDensity1  : number of filters of the first convolution
 *   FilterDensity2  : number of filters of the second convolution
 *   input_channels / output_channels : forwarded to Conv2d_Layer for the
 *                     first convolution and swapped for the second one
 *   First_bias / Second_bias : bias matrices of the two convolutions
 *
 * Steps:
 *   1. per-channel mean of InputIMG (REDUCTION_SUM)
 *   2. Conv1x1 + swish, then Conv1x1 + sigmoid on that mean
 *   3. ConvChannelElementWiseMultiplication with InputIMG and the result of
 *      step 2 (one grid layer in z per channel)
 *
 * All matrices passed in are device matrices.
 */
void Squeeze_and_Excite(Matrix* InputIMG, Matrix* Result,
                        Matrix* Filter1, Matrix* Filter2,
                        int FilterDensity2, int FilterDensity1,
                        int input_channels, int output_channels,
                        Matrix * First_bias, Matrix *Second_bias)
{
    (void)Result;

    // MEAN only fills the unused `sum` slot of REDUCTION_SUM.
    Matrix MEAN, Result_Mean;

    // Room for the partial sums of the first reduction pass:
    // one row per channel, one column per reduction block.
    Set_DeviceMatrix(InputIMG -> depth,
                      (int)ceil((double)InputIMG -> height * InputIMG -> width / (2 * BLOCK_SIZE)),
                      1, 
                      &Result_Mean, 
                      "Reesult Mean matrix allocated in device memory");

    REDUCTION_SUM(InputIMG, &MEAN, &Result_Mean);
 

    // tmp1 carries the data between the two convolutions; Dims(1 x 1 x FilterDensity1)
    Matrix tmp1;
    Set_DeviceMatrix(1, 1, FilterDensity1, &tmp1, "Allocating tmp1 in device for transition");
 
    // tmp2 holds the sigmoid output; Dims(1 x 1 x FilterDensity2)
    Matrix tmp2;
    Set_DeviceMatrix( 1, 1, FilterDensity2, &tmp2, "Allocating tmp2 in device for final output");
 
    // Conv2d_Layer expects the mean as a 1 x 1 x C tensor; only the
    // dimension fields are rewritten here.
    Set_HostMatrix(1, 1, InputIMG -> depth, &Result_Mean);
 
    // Sequence: Conv1x1, swish, Conv1x1, sigmoid
    Conv2d_Layer(&Result_Mean, Filter1, &tmp1, 1, 0, input_channels, output_channels, FilterDensity1,
                 Conv2d_1_x_1, SWISH_ACTIVATION,
                 BIASED, First_bias);
    
    Conv2d_Layer(&tmp1, Filter2, &tmp2, 1, 0, output_channels, input_channels, FilterDensity2,
                 Conv2d_1_x_1, SIGMOID_ACTIVATION,
                 BIASED, Second_bias);
 

    int nbx = (int)ceil((float)InputIMG -> width / DYNAMIC_TILE);
    int nby = (int)ceil((float)InputIMG -> height / DYNAMIC_TILE);
    int nbz = InputIMG -> depth;

    if (nbx == 0) nbx = 1;

    if (nby == 0) nby = 1;

    // 3D grid: each block layer in z handles one channel.
    dim3 dim_Grid2(nbx, nby, nbz);
    dim3 dim_Block2(DYNAMIC_TILE, DYNAMIC_TILE, 1);

    ConvChannelElementWiseMultiplication <<< dim_Grid2, dim_Block2 >>> (InputIMG -> elements,
                                                                        InputIMG -> height,
                                                                        InputIMG -> width,
                                                                        InputIMG -> depth,
                                                                        tmp2.elements);

    // Release every temporary. Result_Mean used to be left behind, leaking
    // one buffer per squeeze-and-excite call.
    cudaFree(Result_Mean.elements);
    cudaFree(tmp1.elements);
    cudaFree(tmp2.elements);
}

// Warning: Fuction Input matrices are allocated in device memory directly
// InputIMG, FilterK and ConvOut are device memory allocations
/*
 * Runs one convolution of the requested type on device matrices.
 *
 *   InputIMG, FilterK, ConvOut : input, filter and output (device memory)
 *   stride, padding            : geometry (ignored by the 1x1 branch)
 *   InputChannels, OutputChannels : not used inside this function
 *   FilterDensity              : number of filters
 *   Conv_Type                  : Conv2d_1_x_1, DWConv_k_x_k, anything else is
 *                                treated as a regular convolution
 *   activation_type, BIASED_CHOISE, biasMat : forwarded to Conv_vidMultiplier
 *
 * Side effects a caller should know about:
 *   - the 1x1 branch rewrites the dimension fields of InputIMG and FilterK
 *     and does not restore them;
 *   - the regular branch rewrites the dimension fields of FilterK;
 *   - ConvOut's dimensions are reset by Conv_vidMultiplier.
 *
 * NOTE(review): the regular branch hard-codes a 3x3 kernel for the unrolled
 * buffer and for Input_Unroll_gpu while the output size is derived from
 * FilterK's height and width - confirm that only 3x3 filters reach it.
 */
void Conv2d_Layer(Matrix* InputIMG, Matrix* FilterK, Matrix* ConvOut,
                  int stride, int padding,
                  int InputChannels, int OutputChannels, int FilterDensity,
                  int Conv_Type, int activation_type,
                  int BIASED_CHOISE, Matrix *biasMat)
{
    int OutputHeight = 0, OutputWidth = 0, OutputDepth = 0;

    // 1x1 Conv2d is a special case of convolution: stride = 1, no padding, K = 1
    if (Conv_Type == Conv2d_1_x_1)
    {
        // Spatial size is unchanged; only the depth differs from the input.
        OutputHeight = InputIMG -> height; OutputWidth = InputIMG -> width; OutputDepth = FilterDensity;

        /*
          Set_HostMatrix is used below only to rewrite dimension fields
          (per the original note it does nothing else), which is why it is
          applied to device matrices.
        */

        // Filter viewed as ((K^2 * M) x C x 1); K = 1
        Set_HostMatrix(1 * 1 * FilterDensity, InputIMG -> depth, 1, FilterK);

        // Input viewed as (C x (H * W) x 1)
        Set_HostMatrix(InputIMG -> depth, InputIMG -> height * InputIMG -> width, 1, InputIMG);

        // Output viewed as ((K^2 * M) x (H * W) x 1); K = 1
        Set_HostMatrix(1 * 1 * FilterDensity, OutputWidth * OutputHeight, 1, ConvOut);

        Conv_vidMultiplier(ConvOut, InputIMG, FilterK,
                            OutputHeight, OutputWidth, OutputDepth,
                            Conv2d_1_x_1, 1,
                            activation_type, 
                            BIASED_CHOISE, biasMat);
    }
    else if (Conv_Type == DWConv_k_x_k)
    {
        // ptr is the matrix actually convolved: the input, or its padded copy.
        Matrix* ptr = InputIMG;

        // The caller has already sized ConvOut for stride / padding / kernel.
        OutputHeight = ConvOut -> height; OutputWidth = ConvOut -> width; OutputDepth = ConvOut -> depth;

        Matrix padded_matr;
        if (padding != 0)
        {
            Padding_Zeros_Function(InputIMG, padding, &padded_matr);
            ptr = &padded_matr;
        }
       
        Conv_vidMultiplier(ConvOut, ptr, FilterK,
                            OutputHeight, OutputWidth, OutputDepth,
                            DWConv_k_x_k, stride,
                            activation_type, 
                            BIASED_CHOISE, biasMat);

        // ConvOut holds the result; the padded copy was previously leaked.
        if (ptr == &padded_matr)
            cudaFree(padded_matr.elements);
    }
    // Any other convolution type goes here
    else
    {        
        // Regular convolution: filter and input unrolling
        Matrix* ptr = InputIMG;
        OutputHeight = (ptr -> height + 2 * padding - FilterK -> height) / stride + 1;
        OutputWidth = (ptr -> width + 2 * padding - FilterK -> width) / stride + 1;
        OutputDepth = FilterDensity;

        Matrix padded_matr;
        if (padding != 0)
        {
            Padding_Zeros_Function(InputIMG, padding, &padded_matr);
            ptr = &padded_matr;          
        }

        // 1st phase: filter unrolling.
        // Unrolled filter has dimensions (M x (C * k * k) x 1)
        Set_HostMatrix(FilterDensity, FilterK -> depth / FilterDensity * FilterK -> height * FilterK -> width,
                      1, FilterK);

        // 2nd phase: input unrolling.
        // The unrolled input has dimensions ((C * k * k) x (H_out * W_out) x 1)
        Matrix INPUT_MODIFIED;
        
        Set_DeviceMatrix(ptr -> depth * 3 * 3,
                        OutputHeight * OutputWidth, 1,
                        &INPUT_MODIFIED, 
                        "Input unrolled Matrix allocated in device memory");

        Input_Unroll_gpu(stride, ptr, &INPUT_MODIFIED, OutputHeight, OutputWidth, 3);

        // Convolution output has dimensions (M x (H_out * W_out) x 1)
        Set_HostMatrix(FilterDensity, OutputWidth * OutputHeight, 1, ConvOut);

        // Multiply, then let Conv_vidMultiplier restore the output dimensions
        Conv_vidMultiplier(ConvOut, &INPUT_MODIFIED, FilterK,
                            OutputHeight, OutputWidth, OutputDepth,
                            Regular_Conv, stride,
                            activation_type,
                            BIASED_CHOISE, biasMat);

        // Both temporaries were previously leaked on every call.
        cudaFree(INPUT_MODIFIED.elements);
        if (ptr == &padded_matr)
            cudaFree(padded_matr.elements);
    }
 }

// 5 Filters needed to run the 4 layers sequentially
/*
 * One MBConv block: optional 1x1 expansion, depthwise convolution,
 * squeeze-and-excite, 1x1 projection and an optional skip connection.
 *
 *   Input, MBConvOut : block input and output (device matrices); MBConvOut is
 *                      set up inside this function
 *   F1 .. F5         : filters for expansion, depthwise, the two
 *                      squeeze-and-excite convolutions and the projection
 *   FD1 .. FD5       : filter counts matching F1 .. F5
 *   input_channels, output_channels : channel counts (output_channels is not
 *                      used inside this function)
 *   FilterSizeDW, Stride, padding : depthwise convolution geometry
 *   skip             : non-zero to run MBConv_SKIP_IDENTITY on the output
 *   bias1, bias2     : biases of the squeeze-and-excite convolutions
 *   MBConv_*_BN_*    : batch-norm statistics for the three BN stages
 *
 * When MBCONV1_0_flag == 1 the expansion stage is skipped and the depthwise
 * convolution reads Input directly. All matrices passed in are device
 * matrices; nothing is copied from the host here.
 */
void MBConv_Layer(Matrix* Input, Matrix* MBConvOut,
    Matrix* F1, Matrix* F2, Matrix* F3, Matrix* F4, Matrix* F5,
    int FD1, int FD2, int FD3, int FD4, int FD5,
    int input_channels, int output_channels, int FilterSizeDW,
    int Stride, int padding, int skip,
    Matrix *bias1, Matrix *bias2,
    Matrix *MBConv_expansion_conv_BN_mean,     Matrix *MBConv_expansion_conv_BN_variance,
    Matrix *MBConv_expansion_conv_BN_weights,  Matrix *MBConv_expansion_conv_BN_bias,
    Matrix *MBConv_depthwise_conv_BN_mean,     Matrix *MBConv_depthwise_conv_BN_variance,
    Matrix *MBConv_depthwise_conv_BN_weights,  Matrix *MBConv_depthwise_conv_BN_bias,
    Matrix *MBConv_project_conv_BN_mean,       Matrix *MBConv_project_conv_BN_variance,
    Matrix *MBConv_project_conv_BN_weights,    Matrix *MBConv_project_conv_BN_bias)
{
    /*
      ptr_mat is what the depthwise stage reads:
        - the block input itself for MBConv1_0,
        - the expansion output (Conv2d + BN + swish) otherwise.
    */
    Matrix tmp1; Matrix *ptr_mat; 
 
    if (MBCONV1_0_flag == 1)
      ptr_mat = Input;
    else
    {     
      Set_DeviceMatrix(Input -> height, Input -> width , FD1, 
                       &tmp1,
                       "Output_1 is allocated in device memory"); 
               
      // 1st layer: 1x1 Conv2d, stride = 1, padding = 0, K = 1
      Conv2d_Layer(Input, F1, &tmp1, 1, 0,
                   input_channels, FD1, FD1,
                   Conv2d_1_x_1,
                   NO_ACTIVATION, 0, NULL);
  
      BN_ALL_PRE_DEFINED(&tmp1, SWISH_ACTIVATION, 
                          MBConv_expansion_conv_BN_mean,    MBConv_expansion_conv_BN_variance,
                          MBConv_expansion_conv_BN_weights, MBConv_expansion_conv_BN_bias);
      ptr_mat = &tmp1;
    }

    // 2nd layer: KxK depthwise conv, stride = s, padding = p, K = k.
    // Height and width change; depth stays the same.
    int OutputHeight = (ptr_mat -> height + 2 * padding - FilterSizeDW)/Stride + 1;
    int OutputWidth = (ptr_mat -> width + 2 * padding - FilterSizeDW)/Stride + 1;
    int OutputDepth = ptr_mat -> depth;
 
    // tmp2 links the depthwise stage to squeeze-excite and projection.
    Matrix tmp2;
    Set_DeviceMatrix(OutputHeight, OutputWidth, OutputDepth, &tmp2,
                    "Output_2 is allocated in device memory");    

    Conv2d_Layer(ptr_mat, F2, &tmp2,
                 Stride, padding, FD1, FD2, FD2, DWConv_k_x_k,
                 NO_ACTIVATION, 0, NULL);
  

    BN_ALL_PRE_DEFINED(&tmp2, SWISH_ACTIVATION, 
                       MBConv_depthwise_conv_BN_mean,     MBConv_depthwise_conv_BN_variance,
                       MBConv_depthwise_conv_BN_weights,  MBConv_depthwise_conv_BN_bias);
  
    // 3rd layer: squeeze and excitation, applied in place to tmp2.
    // Its Result argument is not used, so pass a defined NULL instead of an
    // uninitialised pointer value.
    Matrix *SE_OUT = NULL;
    Squeeze_and_Excite(&tmp2, SE_OUT, F3, F4,
                        FD4, FD3, FD4, FD3,
                        bias1, bias2);

    // 4th layer: 1x1 Conv2d. The block output is set up right before it.
    Set_DeviceMatrix(tmp2.height, tmp2.width, FD5, MBConvOut,
                     "Matrix final output is allocated in device memory");
 
    Conv2d_Layer(&tmp2, F5, MBConvOut, 1, 0, FD4, FD5, FD5, Conv2d_1_x_1,
                 NO_ACTIVATION, 0, NULL);

    // BatchNorm layer (no activation)
    BN_ALL_PRE_DEFINED(MBConvOut, NO_ACTIVATION, 
                       MBConv_project_conv_BN_mean,     MBConv_project_conv_BN_variance,
                       MBConv_project_conv_BN_weights,  MBConv_project_conv_BN_bias);

    // Skip identity layer
    if(skip)
    {
      MBConv_SKIP_IDENTITY(MBConvOut, Input);
    }

    // The intermediate tensors were previously never released, leaking two
    // activation-sized buffers per block. tmp1 only exists when the
    // expansion stage ran.
    if (ptr_mat == &tmp1)
      cudaFree(tmp1.elements);
    cudaFree(tmp2.elements);
}


/*
 * Launches Identity_Skip over `parent`, passing `child`'s elements as the
 * second operand.
 *
 *   parent : matrix whose dimensions define the launch
 *   child  : matrix supplying the other operand
 *
 * NOTE(review): presumably the kernel adds child into parent (residual
 * connection); the kernel body is not visible here - confirm.
 */
void MBConv_SKIP_IDENTITY(Matrix *parent, Matrix *child)
{
    // Tiles needed to cover one channel, never fewer than one per axis.
    int tiles_x = (int)ceil((float)parent -> width / DYNAMIC_TILE);
    int tiles_y = (int)ceil((float)parent -> height / DYNAMIC_TILE);

    tiles_x = (tiles_x == 0) ? 1 : tiles_x;
    tiles_y = (tiles_y == 0) ? 1 : tiles_y;

    // 3D grid: one layer of blocks in z per channel.
    const int channel_layers = parent -> depth;
    dim3 launch_grid(tiles_x, tiles_y, channel_layers);
    dim3 launch_block(DYNAMIC_TILE, DYNAMIC_TILE, 1);

    Identity_Skip <<< launch_grid, launch_block >>> (parent -> elements,
                                                     parent -> height,
                                                     parent -> width,
                                                     parent -> depth,
                                                     child -> elements);
}

/*
 * Applies batch normalisation with pre-loaded statistics to a device matrix.
 *
 *   D_input  : device matrix handed to the kernel
 *   activate : activation selector forwarded to the kernel
 *   mean, variance, weights, bias : pre-defined parameters; per the original
 *                                   note each one is 1 x 1 x C
 *
 * Per the original note the kernel computes
 *   y = ((x - mean) / (sqrt(variance) + epsilon)) * weights + bias
 * NOTE(review): the kernel body is outside this chunk - confirm the formula
 * and whether it works in place.
 */
void BN_ALL_PRE_DEFINED(Matrix* D_input, int activate, Matrix *mean, Matrix *variance, Matrix *weights, Matrix *bias)
{
    // Tiles needed to cover one channel, never fewer than one per axis.
    int tiles_x = (int)ceil((float)D_input -> width / DYNAMIC_TILE);
    int tiles_y = (int)ceil((float)D_input -> height / DYNAMIC_TILE);

    tiles_x = (tiles_x == 0) ? 1 : tiles_x;
    tiles_y = (tiles_y == 0) ? 1 : tiles_y;

    // 3D grid: one layer of blocks in z per channel.
    const int channel_layers = D_input -> depth;
    dim3 launch_grid(tiles_x, tiles_y, channel_layers);
    dim3 launch_block(DYNAMIC_TILE, DYNAMIC_TILE, 1);

    BN_Kernel_Final_Layer <<< launch_grid, launch_block >>> (D_input -> elements,
                                                             D_input -> height,
                                                             D_input -> width,
                                                             D_input -> depth,
                                                             mean -> elements, variance -> elements,
                                                             weights -> elements, bias -> elements,
                                                             activate);
}

/*
 * Builds a zero-padded copy of a device matrix.
 *
 *   Original_Matrix_Before : device matrix to pad
 *   padding_Value          : cells added on every side of height and width
 *   padded_Matrix          : output; it is set up here with dimensions
 *                            (H + 2p) x (W + 2p) x D, so it must not already
 *                            own storage
 *
 * The padded buffer is zero-filled first; Complete_Padding_Process is then
 * launched with both matrices and the padding value.
 */
void Padding_Zeros_Function(Matrix* Original_Matrix_Before, int padding_Value, Matrix* padded_Matrix)
{
    const int border = 2 * padding_Value;

    Set_DeviceMatrix(Original_Matrix_Before->height + border,
                     Original_Matrix_Before->width + border,
                     Original_Matrix_Before->depth,
                     padded_Matrix,
                     "Padded Matrix is allocated in device memory.");

    // Step 1: clear the whole padded buffer.
    const int padded_cells = padded_Matrix->height * padded_Matrix->width * padded_Matrix->depth;
    cudaMemset(padded_Matrix -> elements, 0, padded_cells * sizeof(float));

    // Step 2: launch the copy kernel over the padded extent, one grid layer per channel.
    int tiles_x = (int)ceil((float)padded_Matrix -> width / DYNAMIC_TILE);
    int tiles_y = (int)ceil((float)padded_Matrix -> height / DYNAMIC_TILE);

    tiles_x = (tiles_x == 0) ? 1 : tiles_x;
    tiles_y = (tiles_y == 0) ? 1 : tiles_y;

    dim3 launch_grid(tiles_x, tiles_y, padded_Matrix -> depth);
    dim3 launch_block(DYNAMIC_TILE, DYNAMIC_TILE, 1);

    Complete_Padding_Process <<< launch_grid, launch_block >>> (padded_Matrix -> elements,
                                                                padded_Matrix -> height,
                                                                padded_Matrix -> width,
                                                                padded_Matrix -> depth,
                                                                Original_Matrix_Before -> elements,
                                                                Original_Matrix_Before -> height,
                                                                Original_Matrix_Before -> width,
                                                                Original_Matrix_Before -> depth,
                                                                padding_Value);
}


// Call this function directly for 1x1 conv2d. Don't call for DWConv
/*
 * Launches the kernel that produces a convolution output and then resets the
 * output matrix dimensions.
 *
 *   out_11    : output device matrix
 *   D_2, D_1  : the two operand device matrices (argument order per kernel
 *               is shown at the launches below)
 *   ReconstructOut* : dimensions written back to out_11 before returning
 *   ConvType  : Conv2d_1_x_1 or Regular_Conv use MatrixMulKernel; any other
 *               value uses DWConv2d_kernel
 *   stride_DW : stride passed to the depthwise kernel
 *   activation_type, BIASED_CHOISE, biasMat : forwarded to MatrixMulKernel
 */
void Conv_vidMultiplier(Matrix* out_11, Matrix* D_2, Matrix* D_1,
                        int ReconstructOutHieght, int ReconstructOutWidth, int ReconstructOutDepth,
                        int ConvType, int stride_DW, int activation_type, int BIASED_CHOISE, Matrix *biasMat)
{
    const bool uses_gemm = (ConvType == Conv2d_1_x_1 || ConvType == Regular_Conv);

    if (!uses_gemm)
    {
        // Depthwise convolution: 3D grid, one layer of blocks in z per channel.
        int tiles_x = (int)ceil((float)out_11 -> width / TileDW);
        int tiles_y = (int)ceil((float)out_11 -> height / TileDW);

        tiles_x = (tiles_x == 0) ? 1 : tiles_x;
        tiles_y = (tiles_y == 0) ? 1 : tiles_y;

        dim3 launch_grid(tiles_x, tiles_y, out_11 -> depth);
        dim3 launch_block(TileDW, TileDW, 1);

        DWConv2d_kernel <<< launch_grid, launch_block >>> (D_2 -> elements, D_2 -> height, D_2 -> width, D_2 -> depth,
                                                           D_1 -> elements, D_1 -> height, D_1 -> width, D_1 -> depth,
                                                           out_11 -> elements, out_11 -> height, out_11 -> width, out_11 -> depth,
                                                           stride_DW);
    }
    else
    {
        // 1x1 and regular convolutions are both expressed as a matrix product.
        int tiles_x = (int)ceil((float)out_11 -> width / (THREAD_GRANULARITY_BLOCKS * Tile_GEMM));
        int tiles_y = (int)ceil((float)out_11 -> height / Tile_GEMM);
        int phase_count = (int)ceil((float)D_1 -> width / Tile_GEMM);

        // Guard against an empty grid.
        tiles_x = (tiles_x == 0) ? 1 : tiles_x;
        tiles_y = (tiles_y == 0) ? 1 : tiles_y;

        dim3 launch_grid(tiles_x, tiles_y, 1);
        dim3 launch_block(Tile_GEMM, Tile_GEMM, 1);

        // The biased and unbiased launches differ only in the bias pointer.
        float *bias_elements = NULL;
        if (BIASED_CHOISE == BIASED)
        {
            // View the bias as one value per output row.
            Set_HostMatrix(out_11 -> height, 1, 1, biasMat);
            bias_elements = biasMat -> elements;
        }

        MatrixMulKernel <<< launch_grid, launch_block >>> (D_1 -> elements, D_1 -> height, D_1 -> width, D_1 -> depth,
                                                           D_2 -> elements, D_2 -> height, D_2 -> width, D_2 -> depth,
                                                           out_11 -> elements, out_11 -> height, out_11 -> width, out_11 -> depth,
                                                           phase_count, activation_type,
                                                           BIASED_CHOISE, bias_elements);
    }

    // Give the output back its network-facing dimensions.
    Set_HostMatrix(ReconstructOutHieght, ReconstructOutWidth, ReconstructOutDepth, out_11);
}

/*
 * Launches INPUT_UNROLLING to unroll a device input for a regular convolution.
 *
 *   st_stride       : convolution stride
 *   Device_Input    : input matrix, already allocated on the device
 *   Device_Unrolled : destination matrix, already allocated on the device
 *   O_H, O_W        : convolution output height and width
 *   Filter_Size     : filter size passed to the kernel as Filter_Height
 *
 * Prints the error and exits with status -1 if cudaGetLastError reports a
 * failure after the launch.
 */
void Input_Unroll_gpu(int st_stride, Matrix* Device_Input, Matrix* Device_Unrolled, int O_H, int O_W, int Filter_Size)
{
    // Tiles covering the output plane, never fewer than one per axis.
    int tiles_x = (int)ceil((float)O_W / TileDW);
    int tiles_y = (int)ceil((float)O_H / TileDW);

    tiles_x = (tiles_x == 0) ? 1 : tiles_x;
    tiles_y = (tiles_y == 0) ? 1 : tiles_y;

    // One layer of blocks in z per input channel.
    dim3 launch_grid(tiles_x, tiles_y, Device_Input -> depth);
    dim3 launch_block(TileDW, TileDW, 1);

    INPUT_UNROLLING <<< launch_grid, launch_block >>> (st_stride, Filter_Size,
                                                       Device_Input -> elements,
                                                       Device_Input -> height,
                                                       Device_Input -> width,
                                                       Device_Input -> depth,
                                                       Device_Unrolled -> elements,
                                                       Device_Unrolled -> height,
                                                       Device_Unrolled -> width,
                                                       Device_Unrolled -> depth,
                                                       O_H, O_W);

    // No synchronisation here; only errors already recorded by the runtime are seen.
    const cudaError launch_status = cudaGetLastError();
    if (launch_status == cudaSuccess)
        return;

    printf("CUDA Error: %s\n", cudaGetErrorString(launch_status));
    exit(-1);
}

/*
 * Uploads the batch-norm parameters of one MBConv block to the device.
 *
 * Three groups of four (mean, variance, weights, bias): expansion (EXP_*),
 * depthwise (DW_*) and projection (PRJ_*). Each destination matrix is paired
 * with its host array filterN and element count size_N, and is uploaded as a
 * size_N x 1 x 1 matrix.
 *
 * The expansion group is skipped when MBCONV1_0_flag is non-zero.
 *
 * The notification strings now name the buffer actually being uploaded;
 * previously the DW bias and all projection buffers carried copy-pasted
 * labels from other buffers.
 */
void DEFINE_FILTERS_FOR_MBCONV_BN(  Matrix *EXP_MEAN,       float *filter1, int size_1,
                                    Matrix *EXP_VARIANCE,   float *filter2, int size_2,
                                    Matrix *EXP_WEIGHTS,    float *filter3, int size_3,
                                    Matrix *EXP_BIAS,       float *filter4, int size_4,

                                    Matrix *DW_MEAN,        float *filter5, int size_5,
                                    Matrix *DW_VARIANCE,    float *filter6, int size_6,
                                    Matrix *DW_WEIGHTS,     float *filter7, int size_7,
                                    Matrix *DW_BIAS,        float *filter8, int size_8,

                                    Matrix *PRJ_MEAN,       float *filter9,  int size_9,
                                    Matrix *PRJ_VARIANCE,   float *filter10, int size_10,
                                    Matrix *PRJ_WEIGHTS,    float *filter11, int size_11,
                                    Matrix *PRJ_BIAS,       float *filter12, int size_12)
{
  // Expansion batch norm: skipped when MBCONV1_0_flag is set.
  if (!MBCONV1_0_flag)
  {
    set_allocate_copy_array_Device(EXP_MEAN, filter1,
                                   size_1, 1, 1,
                                   "expand mean");
    set_allocate_copy_array_Device(EXP_VARIANCE, filter2,
                                   size_2, 1, 1,
                                   "expand variance");
    set_allocate_copy_array_Device(EXP_WEIGHTS, filter3,
                                   size_3, 1, 1,
                                   "expand weights");
    set_allocate_copy_array_Device(EXP_BIAS, filter4,
                                   size_4, 1, 1,
                                   "expand bias");
  }

  // Depthwise batch norm.
  set_allocate_copy_array_Device(DW_MEAN, filter5,
                                 size_5, 1, 1,
                                 "DW mean");
  set_allocate_copy_array_Device(DW_VARIANCE, filter6,
                                 size_6, 1, 1,
                                 "DW variance");
  set_allocate_copy_array_Device(DW_WEIGHTS, filter7,
                                 size_7, 1, 1,
                                 "DW weights");
  set_allocate_copy_array_Device(DW_BIAS, filter8,
                                 size_8, 1, 1,
                                 "DW bias");

  // Projection batch norm.
  set_allocate_copy_array_Device(PRJ_MEAN, filter9,
                                 size_9, 1, 1,
                                 "project mean");
  set_allocate_copy_array_Device(PRJ_VARIANCE, filter10,
                                 size_10, 1, 1,
                                 "project variance");
  set_allocate_copy_array_Device(PRJ_WEIGHTS, filter11,
                                 size_11, 1, 1,
                                 "project weights");
  set_allocate_copy_array_Device(PRJ_BIAS, filter12,
                                 size_12, 1, 1,
                                 "project bias");
}

// Three sequential operations: set the dimensions of `child`, allocate its
// device buffer, and copy `parent` into it. Same as
// "set_allocate_copy_Matrix_Device", but the source is a raw float pointer
// instead of a Matrix.
//
//   child        - descriptor to fill in; Set_DeviceMatrix sets its
//                  dimensions and allocates `child->elements`.
//   parent       - source buffer for the cudaMemcpyHostToDevice copy; it must
//                  hold at least height * width * depth floats.
//   height, width, depth - dimensions of the array being uploaded.
//   notification - label printed by CheckCudaError when a CUDA call fails.
void set_allocate_copy_array_Device(Matrix *child, float *parent,
                                    int height, int width, int depth,
                                    char *notification)
{
    Set_DeviceMatrix(height, width, depth, child, notification);

    // Widen every factor before multiplying: the product of three ints can
    // overflow int long before it overflows size_t.
    size_t size = (size_t)height * (size_t)width * (size_t)depth * sizeof(float);

    cudaError err = cudaMemcpy(child->elements, parent, size,
                               cudaMemcpyHostToDevice);

    CheckCudaError(notification, err);
}

// Three sequential operations: give `child` the dimensions of `parent`,
// allocate the device buffer for `child`, and copy `parent->elements` into it
// with a host-to-device cudaMemcpy.
//
//   child        - descriptor to fill in; Set_DeviceMatrix sets its
//                  dimensions and allocates `child->elements`.
//   parent       - source matrix; its height/width/depth define the copy size.
//   notification - label printed by CheckCudaError when a CUDA call fails.
void set_allocate_copy_Matrix_Device(Matrix *child, Matrix *parent, char *notification)
{
    Set_DeviceMatrix(parent->height, parent->width, parent->depth,
                     child, notification);

    // Compute the byte count in size_t so large matrices do not overflow the
    // intermediate int product.
    size_t size = (size_t)parent->height * (size_t)parent->width *
                  (size_t)parent->depth * sizeof(float);

    cudaError err = cudaMemcpy(child->elements, parent->elements,
                               size, cudaMemcpyHostToDevice);
    CheckCudaError(notification, err);
}

// Variant of set_allocate_copy_Matrix_Device where the dimensions of `child`
// are given explicitly instead of being taken from `parent`.
//
//   child        - descriptor to fill in; Set_DeviceMatrix sets it to
//                  height x width x depth and allocates `child->elements`.
//   parent       - source matrix; only `parent->elements` is read. It must
//                  hold at least height * width * depth floats, because that
//                  many elements are copied regardless of parent's own
//                  dimensions.
//   notification - label printed by CheckCudaError when a CUDA call fails.
//   height, width, depth - dimensions to assign to `child`.
void set_allocate_copy_Matrix_Device_specific(Matrix *child, Matrix *parent, char *notification, int height, int width, int depth)
{
    Set_DeviceMatrix(height, width, depth, child, notification);

    // Byte count comes from the child's (just assigned) dimensions; widen
    // before multiplying to avoid int overflow on large tensors.
    size_t size = (size_t)child->height * (size_t)child->width *
                  (size_t)child->depth * sizeof(float);

    cudaError err = cudaMemcpy(child->elements, parent->elements,
                               size, cudaMemcpyHostToDevice);

    CheckCudaError(notification, err);
}

// Copy the elements of `parent` into the already-allocated buffer of `child`
// with a host-to-device cudaMemcpy. No allocation is performed and the
// dimensions of `child` are not modified.
//
//   child        - destination; `child->elements` must already be able to
//                  hold parent's element count.
//   parent       - source; its width/height/depth define the copy size.
//   notification - label printed by CheckCudaError when the copy fails.
void just_copy_HTD(Matrix *child, Matrix *parent, char *notification)
{
    // Byte count of the source matrix, computed in size_t so the
    // width * height * depth product cannot overflow int.
    size_t size = (size_t)parent->width * (size_t)parent->height *
                  (size_t)parent->depth * sizeof(float);

    cudaError err = cudaMemcpy(child->elements, parent->elements, size,
                               cudaMemcpyHostToDevice);

    CheckCudaError(notification, err);
}

// Copy the elements of `parent` into the already-allocated buffer of `child`
// with a device-to-host cudaMemcpy. No allocation is performed and the
// dimensions of `child` are not modified.
//
//   child        - destination; `child->elements` must already be able to
//                  hold parent's element count.
//   parent       - source; its width/height/depth define the copy size.
//   notification - label printed by CheckCudaError when the copy fails.
void just_copy_DTH(Matrix *child, Matrix *parent, char *notification)
{
    // Byte count of the source matrix, computed in size_t so the
    // width * height * depth product cannot overflow int.
    size_t size = (size_t)parent->width * (size_t)parent->height *
                  (size_t)parent->depth * sizeof(float);

    cudaError err = cudaMemcpy(child->elements, parent->elements, size,
                               cudaMemcpyDeviceToHost);

    CheckCudaError(notification, err);
}

// Set the dimensions of `ptr` and allocate a host buffer of
// height * width * depth floats for it with malloc.
//
// The caller owns the buffer and must release it (see FreeHost_Allocated).
// On allocation failure a message is printed and `ptr->elements` is NULL.
void set_allocate_Host(Matrix *ptr, int height, int width, int depth)
{
    Set_HostMatrix(height, width, depth, ptr);

    // Keep the byte count in size_t: storing it in an int truncates or
    // overflows for large tensors and would request the wrong amount.
    size_t bytes = (size_t)height * (size_t)width * (size_t)depth * sizeof(float);

    ptr->elements = (float *) malloc(bytes);

    // malloc(0) may legitimately return NULL, so only report real failures.
    if (ptr->elements == NULL && bytes != 0)
        printf("Host allocation of %zu bytes failed\n", bytes);
}

// Release the host buffer held by `ptr` (allocated with malloc, e.g. by
// set_allocate_Host). The dimensions stored in `ptr` are left untouched.
void FreeHost_Allocated(Matrix *ptr)
{
    free(ptr->elements);

    // Clear the pointer so a repeated call becomes a harmless free(NULL)
    // instead of a double free, and stale use is easier to spot.
    ptr->elements = NULL;
}

// Allocation for device matrices: store the given dimensions in `ptr` and
// allocate width * height * depth floats with cudaMalloc.
//
//   height, width, depth - dimensions recorded in `ptr`.
//   ptr     - descriptor to fill in; `ptr->elements` receives the device pointer.
//   NamePtr - label printed by CheckCudaError when the allocation fails.
//
// On failure `ptr->elements` is set to NULL so that later CUDA calls fail
// visibly instead of using whatever value the field held before.
void Set_DeviceMatrix(int height, int width, int depth, Matrix* ptr, char* NamePtr)
{
    ptr->width = width;
    ptr->height = height;
    ptr->depth = depth;

    // Widen before multiplying so the byte count cannot overflow int.
    size_t size = (size_t)width * (size_t)height * (size_t)depth * sizeof(float);

    cudaError err = cudaMalloc((void **)&(ptr->elements), size);
    if (err != cudaSuccess)
        ptr->elements = NULL;

    CheckCudaError(NamePtr, err);
}

// Record the dimensions of a host matrix in `ptr`.
// Only the three size fields are written; `ptr->elements` is not touched, so
// no memory is allocated here.
void Set_HostMatrix(int height, int width, int depth, Matrix* ptr)
{
    ptr->height = height;
    ptr->width  = width;
    ptr->depth  = depth;
}

// Report a failed CUDA call.
//   ptr - label identifying the operation, included in the message.
//   err - status returned by the CUDA call.
// Prints nothing when `err` is cudaSuccess; otherwise prints the label and
// the CUDA error string. Execution continues in both cases.
void CheckCudaError(char* ptr, cudaError err)
{
    if (err != cudaSuccess)
    {
        printf("CUDA error in %s: %s\n", ptr, cudaGetErrorString(err));
    }
}


// Print a host matrix to stdout when the global `show_out` flag equals 1.
//
//   ptr     - matrix whose `elements` are read directly, so the buffer must
//             be host-accessible.
//   NamePtr - label printed in the header line.
//
// Output: a header with the label and dimensions, then all
// height * width * depth values as "%.8f" separated by ", ", with a line
// break before every element that starts a new row of `width` values, all
// wrapped in braces.
void show_me_enhanced(Matrix* ptr, char* NamePtr)
{
    if (show_out != 1)
        return;

    setvbuf(stdout, NULL, _IOLBF, 0);

    const int row_len = ptr->width;
    const int total   = ptr->height * ptr->width * ptr->depth;

    printf("%s,it has height = %d, width = %d, depth = %d \n",
           NamePtr, ptr->height, ptr->width, ptr->depth);

    printf("{\n");
    for (int idx = 0; idx < total; ++idx)
    {
        // Start a new output line at every row boundary except the first.
        if (idx >= row_len && idx % row_len == 0)
            printf("\n");

        printf("%.8f", ptr->elements[idx]);

        // Separator after every element but the last one.
        if (idx + 1 != total)
            printf(", ");
    }

    printf("} \n");
    printf("\n");

    setvbuf(stdout, NULL, _IOLBF, 0);
}


// Begin a timing interval: create the global `start_timing` / `stop_timing`
// events and record `start_timing` on stream 0. Each CUDA call is wrapped in
// HANDLE_ERROR. Pair with stop(), which records `stop_timing` and reads the
// elapsed time between the two events.
void start()
{
  // Both events must exist before the interval is opened.
  HANDLE_ERROR(cudaEventCreate(&stop_timing));
  HANDLE_ERROR(cudaEventCreate(&start_timing));

  // Mark the beginning of the measured interval.
  HANDLE_ERROR(cudaEventRecord(start_timing, 0));
}

// End the timing interval opened by start().
//
// Records `stop_timing`, waits for it, and stores the elapsed milliseconds
// between `start_timing` and `stop_timing` in the global `time_defined`.
//
//   notification - label used in the printed message (non-paused mode only).
//   pause_time   - non-zero: silently add the interval to `tmp_time`, to be
//                  reported later by after_pause();
//                  zero: reset `tmp_time`, print the interval, and add it to
//                  `total_time_for_layer`.
void stop(char *notification, int pause_time)
{
  HANDLE_ERROR(cudaEventRecord(stop_timing, 0));
  HANDLE_ERROR(cudaEventSynchronize(stop_timing));
  HANDLE_ERROR(cudaEventElapsedTime(&time_defined, start_timing, stop_timing));

  if (pause_time)
  {
    // Paused mode: only accumulate, nothing is printed.
    tmp_time += time_defined;
    return;
  }

  tmp_time = 0;
  printf("Time elapsed for %s:  %.8f ms\n", notification, time_defined);
  total_time_for_layer += time_defined;
}

// Report the time accumulated in `tmp_time` by paused stop() calls under the
// given label, fold it into `total_time_for_layer`, and clear the accumulator.
void after_pause(char *notification)
{
  total_time_for_layer += tmp_time;
  printf("Time elapsed for %s: %.8f ms\n", notification, tmp_time);

  // Start the next paused measurement from zero.
  tmp_time = 0;
}

// Print the running total held in the global `total_time_for_layer` and then
// reset it to zero so the next group of measurements starts fresh.
void reset_time()
{
  // Report first: the value is lost once the counter is cleared below.
  printf("Total time: %.8f ms\n", total_time_for_layer); 
  total_time_for_layer = 0;
}

// Print the contents of a device matrix.
//
// Allocates a temporary host matrix with the same dimensions as `ptr`, copies
// the device data into it (device-to-host), prints it with show_me_enhanced,
// and releases the temporary buffer again.
//
//   ptr          - device matrix to display.
//   notification - label passed to show_me_enhanced for the header line.
//
// Side effect: the global `show_out` flag is forced to 1 for the duration of
// the print and is left at 0 afterwards.
void show_me_enhanced_from_devince(Matrix *ptr, char *notification)
{
    Matrix H_OUT;

    set_allocate_Host(&H_OUT, ptr->height, ptr->width, ptr->depth);

    just_copy_DTH(&H_OUT, ptr, "show_device_elements");

    show_out = 1;
    show_me_enhanced(&H_OUT, notification);
    show_out = 0;

    // The staging buffer is only needed for this one print; without this it
    // leaks on every call.
    FreeHost_Allocated(&H_OUT);
}

'File written in /content/src/FUNCTIONS.cu'

In [None]:
!nvcc -o /content/src/EfficientNet /content/src/APP.cu /content/src/KERNELS.cu /content/src/FUNCTIONS.cu --use_fast_math 

In [159]:
!/content/src/EfficientNet

Time elapsed for Model: :  7.98169613 ms
Model final output::,it has height = 1, width = 1000, depth = 1 
{
1.08116734, -0.15231955, 0.81180447, -0.55637485, -1.72109163, -0.12775655, -0.30983409, -0.72759855, -0.36455533, -1.96505380, -1.22603118, 0.09439833, -0.24272297, -0.90570623, 0.13351761, -1.01413095, -1.73747325, -1.51516759, 0.19024637, 0.31977376, -0.67285043, -0.78116024, 0.08444787, -0.50209820, -1.27981591, 0.44497553, 0.14523719, 0.15833744, 0.28457651, 2.25681019, -0.87878931, -0.80018413, -0.09799896, -0.37915513, -0.84588838, -0.61278945, 0.65202796, 0.74403727, 0.40012741, 0.17943129, 0.69742870, 0.06787872, -0.85715145, -0.03864693, 0.56585681, 0.79732269, -0.64108318, 0.22881541, 0.24225605, -0.01618855, -1.18664181, 0.80330795, 0.29770195, 0.04999758, 0.31598836, -1.34917235, -0.24388976, -1.53631532, 0.42537394, -0.64515626, -0.65929657, -0.46719217, -0.87873405, 0.10817172, 0.04329031, -0.44704410, -0.63170409, -0.59761018, -0.32319057, 0.09391501, 0.04296068, 

In [None]:
!nvprof /content/src/EfficientNet
!nvidia-smi

In [161]:
%%cu
#include <stdio.h> 

// Query every CUDA device visible to the runtime and print a selection of
// its cudaDeviceProp fields. Returns 0 on success and 1 if the device count
// cannot be obtained; a device whose properties cannot be read is reported
// and skipped.
int main() {
  int nDevices = 0;

  cudaError_t status = cudaGetDeviceCount(&nDevices);
  if (status != cudaSuccess) {
    printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
    return 1;
  }

  for (int i = 0; i < nDevices; i++) {
    cudaDeviceProp prop;
    status = cudaGetDeviceProperties(&prop, i);
    if (status != cudaSuccess) {
      printf("cudaGetDeviceProperties failed for device %d: %s\n",
             i, cudaGetErrorString(status));
      continue;
    }

    printf("Device Number: %d\n", i);
    printf("  Device name: %s\n", prop.name);
    printf("  Memory Clock Rate (KHz): %d\n",             prop.memoryClockRate);
    printf("  Memory Bus Width (bits): %d\n",             prop.memoryBusWidth);
    // size_t fields are printed with %zu, int fields with %d; a mismatched
    // specifier is undefined behaviour.
    printf("  Number of totalGlobalMem %zu\n",            prop.totalGlobalMem);
    printf("  Number of sharedMemPerBlock %zu\n",         prop.sharedMemPerBlock);

    printf("\n  Number of warpSize %d\n",                   prop.warpSize);
    printf("  Number of maxThreadsPerBlock %d\n",         prop.maxThreadsPerBlock);
    printf("  Number of maxBlocksPerMultiProcessor %d\n", prop.maxBlocksPerMultiProcessor);
    printf("  Number of multiProcessorCount %d\n",        prop.multiProcessorCount);
    printf("  Number of maxThreadsPerMultiProcessor %d\n", prop.maxThreadsPerMultiProcessor);

    // Only the x-dimension limits are shown for the block and grid sizes.
    printf("\n  Number of maxThreadsDim %d\n",              prop.maxThreadsDim[0]);
    printf("  Number of maxGridSize %d\n",                prop.maxGridSize[0]);
    printf("  Number of totalConstMem %zu\n",             prop.totalConstMem);
    printf("  Number of sharedMemPerMultiprocessor %zu\n", prop.sharedMemPerMultiprocessor);

    // 2 transfers per clock * clock (kHz) * bus width (bytes) / 1e6 -> GB/s.
    printf("  Peak Memory Bandwidth (GB/s): %f\n\n", 2.0 * prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    printf("deviceOverlap is 1 if the device can concurrently copy memory between host and device while executing a kernel. It's: %d \n", prop.deviceOverlap);
    // Trailing newline keeps the next device's block from starting on this line.
    printf("asyncEngineCount: %d\n", prop.asyncEngineCount);
  }

  return 0;
}

Device Number: 0
  Device name: Tesla T4
  Memory Clock Rate (KHz): 5001000
  Memory Bus Width (bits): 256
  Number of totalGlobalMem 15843721216
  Number of sharedMemPerBlock 49152

  Number of warpSize 32
  Number of maxThreadsPerBlock 1024
  Number of maxBlocksPerMultiProcessor 16
  Number of multiProcessorCount 40
  Number of maxThreadsPerMultiProcessor 1024

  Number of maxThreadsDim 1024
  Number of maxGridSize 2147483647
  Number of totalConstMem 65536
  Number of sharedMemPerMultiprocessor 65536
  Peak Memory Bandwidth (GB/s): 320.064000

deviceOverlap is 1 if the device can concurrently copy memory between host and device while executing a kernel. It's: 1 
asyncEngineCount: 3


In [15]:
#  struct cudaDeviceProp {
#               int major;
#               int minor;
#               size_t textureAlignment;
#               size_t texturePitchAlignment;
#               int canMapHostMemory;
#               int computeMode;
#               int maxTexture1D;
#               int maxTexture1DMipmap;
#               int maxTexture1DLinear;
#               int maxTexture2D[2];
#               int maxTexture2DMipmap[2];
#               int maxTexture2DLinear[3];
#               int maxTexture2DGather[2];
#               int maxTexture3D[3];
#               int maxTexture3DAlt[3];
#               int maxTextureCubemap;
#               int maxTexture1DLayered[2];
#               int maxTexture2DLayered[3];
#               int maxTextureCubemapLayered[2];
#               int maxSurface1D;
#               int maxSurface2D[2];
#               int maxSurface3D[3];
#               int maxSurface1DLayered[2];
#               int maxSurface2DLayered[3];
#               int maxSurfaceCubemap;
#               int maxSurfaceCubemapLayered[2];
#               size_t surfaceAlignment;
#               int concurrentKernels;
#               int ECCEnabled;
#               int pciBusID;
#               int pciDeviceID;
#               int pciDomainID;
#               int tccDriver;
#               int asyncEngineCount;
#               int unifiedAddressing;
#               int memoryClockRate;
#               int memoryBusWidth;
#               int l2CacheSize;
#               int persistingL2CacheMaxSize;
#               int maxThreadsPerMultiProcessor;
#               int streamPrioritiesSupported;
#               int globalL1CacheSupported;
#               int localL1CacheSupported;
#               size_t sharedMemPerMultiprocessor;
#               int regsPerMultiprocessor;
#               int managedMemory;
#               int isMultiGpuBoard;
#               int multiGpuBoardGroupID;
#               int singleToDoublePrecisionPerfRatio;
#               int pageableMemoryAccess;
#               int concurrentManagedAccess;
#               int computePreemptionSupported;
#               int canUseHostPointerForRegisteredMem;
#               int cooperativeLaunch;
#               int cooperativeMultiDeviceLaunch;
#               int pageableMemoryAccessUsesHostPageTables;
#               int directManagedMemAccessFromHost;
#               int accessPolicyMaxWindowSize;