In [1]:
# Step 1: Download the pre-built ROOT tarball from GitHub Releases
!wget -q --show-progress https://github.com/MohamedElashri/ROOT/releases/download/ubuntu/root_v6.30.04_Ubuntu_Python3.11.zip
# Step 2: Extract the ROOT files
!unzip -q root_v6.30.04_Ubuntu_Python3.11.zip

# Step 3: Install missing system dependencies for ROOT
!sudo ldconfig & apt-get install -y git dpkg-dev cmake g++ gcc binutils libx11-dev libxpm-dev libxft-dev libxext-dev tar gfortran subversion libpython3.11-dev

# Step 4: Remove the tarball to free up space
!rm -f root_v6.30.04_Ubuntu_Python3.11.zip

# Step 5: Install Compatible libssl

!wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb
!sudo dpkg -i libssl1.1_1.1.1f-1ubuntu2_amd64.deb
!rm -f libssl1.1_1.1.1f-1ubuntu2_amd64.deb


/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_loader.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_adapter_opencl.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtcm_debug.so.1 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libumf.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtcm.so.1 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libhwloc.so.15 is

In [2]:
!mkdir -p /content/tmva_cuda_project/include/TMVA
!mkdir -p /content/tmva_cuda_project/src
!mkdir -p /content/tmva_cuda_project/test
!mkdir -p /content/tmva_cuda_project/build

In [3]:
%%writefile /content/tmva_cuda_project/include/TMVA/ROperator_BatchNorm_CUDA.hxx
#ifndef TMVA_SOFIE_ROPERATOR_BATCHNORM_CUDA
#define TMVA_SOFIE_ROPERATOR_BATCHNORM_CUDA

#include "TMVA/ROperator.hxx"
#include <cuda_runtime.h>
#include <vector>
#include <string>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

template <typename T>
class ROperator_BatchNorm_CUDA final : public ROperator
{
private:
   std::string fNX;      // Input tensor name
   std::string fNY;      // Output tensor name
   std::string fNScale;  // Scale tensor name
   std::string fNBias;   // Bias tensor name
   std::string fNMean;   // Mean tensor name
   std::string fNVar;    // Variance tensor name

   float fEpsilon;       // Epsilon for numerical stability
   int fNumChannels;     // Number of channels (C dimension)
   std::vector<size_t> fInputShape;   // Input tensor shape

   int fSpatialDims;     // Number of spatial dimensions
   std::string fType;    // Type string

public:
   ROperator_BatchNorm_CUDA() = default;

   ROperator_BatchNorm_CUDA(std::string nameX, std::string nameScale, std::string nameBias,
                            std::string nameMean, std::string nameVar, std::string nameY,
                            float epsilon = 1e-5f);

   // Required ROperator interface methods
   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) {
      return input;
   }

   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) {
      return {input[0]};  // Output shape matches input shape
   }

   void Initialize(RModel& model) override;
   std::string Generate(std::string OpName) override;
};

// Declare specializations
extern template class ROperator_BatchNorm_CUDA<float>;
extern template class ROperator_BatchNorm_CUDA<double>;

}}} // namespace TMVA::Experimental::SOFIE

#endif // TMVA_SOFIE_ROPERATOR_BATCHNORM_CUDA

Writing /content/tmva_cuda_project/include/TMVA/ROperator_BatchNorm_CUDA.hxx


In [4]:
%%writefile /content/tmva_cuda_project/src/ROperator_BatchNorm_CUDA.cu
#include "TMVA/ROperator_BatchNorm_CUDA.hxx"
#include "TMVA/RModel.hxx"
#include <sstream>
#include <iostream>
#include <iomanip>

// CUDA kernel for BatchNorm (float version)
__global__ void batchNormKernelFloat(const float* input, const float* scale, const float* bias,
                                   const float* mean, const float* var, float* output,
                                   size_t N, size_t C, size_t spatialSize, float epsilon) {
    // Calculate global thread index
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Total elements = N * C * spatialSize
    int total = N * C * spatialSize;

    if (idx < total) {
        // Calculate indices for the current element
        int n = idx / (C * spatialSize);           // Batch index
        int c = (idx / spatialSize) % C;           // Channel index
        int spatialIdx = idx % spatialSize;        // Spatial index

        // Apply batch normalization formula
        float normalized = (input[idx] - mean[c]) / sqrtf(var[c] + epsilon);
        output[idx] = scale[c] * normalized + bias[c];
    }
}

// CUDA kernel for BatchNorm (double version)
__global__ void batchNormKernelDouble(const double* input, const double* scale, const double* bias,
                                    const double* mean, const double* var, double* output,
                                    size_t N, size_t C, size_t spatialSize, double epsilon) {
    // Calculate global thread index
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Total elements = N * C * spatialSize
    int total = N * C * spatialSize;

    if (idx < total) {
        // Calculate indices for the current element
        int n = idx / (C * spatialSize);           // Batch index
        int c = (idx / spatialSize) % C;           // Channel index
        int spatialIdx = idx % spatialSize;        // Spatial index

        // Apply batch normalization formula
        double normalized = (input[idx] - mean[c]) / sqrt(var[c] + epsilon);
        output[idx] = scale[c] * normalized + bias[c];
    }
}

namespace TMVA {
namespace Experimental {
namespace SOFIE {

template <typename T>
ROperator_BatchNorm_CUDA<T>::ROperator_BatchNorm_CUDA(
    std::string nameX, std::string nameScale, std::string nameBias,
    std::string nameMean, std::string nameVar, std::string nameY, float epsilon) :
    fNX(UTILITY::Clean_name(nameX)),
    fNScale(UTILITY::Clean_name(nameScale)),
    fNBias(UTILITY::Clean_name(nameBias)),
    fNMean(UTILITY::Clean_name(nameMean)),
    fNVar(UTILITY::Clean_name(nameVar)),
    fNY(UTILITY::Clean_name(nameY)),
    fEpsilon(epsilon),
    fNumChannels(0),
    fSpatialDims(0)
{
    fInputTensorNames = {fNX, fNScale, fNBias, fNMean, fNVar};
    fOutputTensorNames = {fNY};

    if (std::is_same<T, float>::value) {
        fType = "float";
    } else if (std::is_same<T, double>::value) {
        fType = "double";
    } else {
        throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a BatchNorm CUDA operator");
    }
}

template <typename T>
void ROperator_BatchNorm_CUDA<T>::Initialize(RModel& model)
{
    // Check if all input tensors exist
    if (!model.CheckIfTensorAlreadyExist(fNX)) {
        throw std::runtime_error("TMVA SOFIE BatchNorm CUDA Op: Input tensor " + fNX + " not found in model");
    }
    if (!model.CheckIfTensorAlreadyExist(fNScale)) {
        throw std::runtime_error("TMVA SOFIE BatchNorm CUDA Op: Scale tensor " + fNScale + " not found in model");
    }
    if (!model.CheckIfTensorAlreadyExist(fNBias)) {
        throw std::runtime_error("TMVA SOFIE BatchNorm CUDA Op: Bias tensor " + fNBias + " not found in model");
    }
    if (!model.CheckIfTensorAlreadyExist(fNMean)) {
        throw std::runtime_error("TMVA SOFIE BatchNorm CUDA Op: Mean tensor " + fNMean + " not found in model");
    }
    if (!model.CheckIfTensorAlreadyExist(fNVar)) {
        throw std::runtime_error("TMVA SOFIE BatchNorm CUDA Op: Variance tensor " + fNVar + " not found in model");
    }

    // Get input shape and validate
    fInputShape = model.GetTensorShape(fNX);

    // BatchNorm requires at least 2D tensor (N, C, ...) where C is channels
    if (fInputShape.size() < 2) {
        throw std::runtime_error("TMVA SOFIE BatchNorm CUDA Op: Input tensor must have at least 2 dimensions");
    }

    // Number of channels is the second dimension
    fNumChannels = fInputShape[1];

    // Number of spatial dimensions is total dims minus batch and channel dims
    fSpatialDims = fInputShape.size() - 2;

    // Validate parameters shapes (scale, bias, mean, var should all be 1D with C elements)
    std::vector<size_t> scaleShape = model.GetTensorShape(fNScale);
    std::vector<size_t> biasShape = model.GetTensorShape(fNBias);
    std::vector<size_t> meanShape = model.GetTensorShape(fNMean);
    std::vector<size_t> varShape = model.GetTensorShape(fNVar);

    if (scaleShape.size() != 1 || scaleShape[0] != fNumChannels) {
        throw std::runtime_error("TMVA SOFIE BatchNorm CUDA Op: Scale tensor must be 1D with C elements");
    }
    if (biasShape.size() != 1 || biasShape[0] != fNumChannels) {
        throw std::runtime_error("TMVA SOFIE BatchNorm CUDA Op: Bias tensor must be 1D with C elements");
    }
    if (meanShape.size() != 1 || meanShape[0] != fNumChannels) {
        throw std::runtime_error("TMVA SOFIE BatchNorm CUDA Op: Mean tensor must be 1D with C elements");
    }
    if (varShape.size() != 1 || varShape[0] != fNumChannels) {
        throw std::runtime_error("TMVA SOFIE BatchNorm CUDA Op: Variance tensor must be 1D with C elements");
    }

    // Add output tensor with same shape as input
    model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fInputShape);

    if (model.Verbose()) {
        std::cout << "BatchNorm CUDA: " << fNX << " -> " << fNY;
        std::cout << " (epsilon=" << fEpsilon << ", channels=" << fNumChannels;
        std::cout << ", spatial_dims=" << fSpatialDims << ")" << std::endl;
    }
}

template <typename T>
std::string ROperator_BatchNorm_CUDA<T>::Generate(std::string OpName)
{
    OpName = "op_" + OpName;
    if (fInputShape.empty()) {
        throw std::runtime_error("TMVA SOFIE Operator BatchNorm CUDA called to Generate without being initialized first");
    }

    std::stringstream out;

    // Calculate dimensions
    size_t batchSize = fInputShape[0];
    size_t spatialSize = 1;
    for (size_t i = 2; i < fInputShape.size(); i++) {
        spatialSize *= fInputShape[i];
    }
    size_t totalSize = batchSize * fNumChannels * spatialSize;

    // Begin code generation
    out << "\n//------ BatchNorm CUDA (epsilon=" << std::scientific << fEpsilon << ")\n";

    // Define the CUDA kernel
    out << SP << "// CUDA kernel for BatchNorm operation\n";
    out << SP << "__global__ void " << OpName << "_batchnorm_kernel(const " << fType << "* input, "
        << "const " << fType << "* scale, const " << fType << "* bias, "
        << "const " << fType << "* mean, const " << fType << "* var, "
        << fType << "* output, size_t N, size_t C, size_t spatialSize, " << fType << " epsilon) {\n";

    out << SP << SP << "// Calculate global thread index\n";
    out << SP << SP << "int idx = blockIdx.x * blockDim.x + threadIdx.x;\n\n";

    out << SP << SP << "// Total elements = N * C * spatialSize\n";
    out << SP << SP << "int total = N * C * spatialSize;\n\n";

    out << SP << SP << "if (idx < total) {\n";
    out << SP << SP << SP << "// Calculate indices for the current element\n";
    out << SP << SP << SP << "int n = idx / (C * spatialSize);           // Batch index\n";
    out << SP << SP << SP << "int c = (idx / spatialSize) % C;           // Channel index\n";
    out << SP << SP << SP << "int spatialIdx = idx % spatialSize;        // Spatial index\n\n";

    out << SP << SP << SP << "// Apply batch normalization formula\n";
    if (std::is_same<T, float>::value) {
        out << SP << SP << SP << fType << " normalized = (input[idx] - mean[c]) / sqrtf(var[c] + epsilon);\n";
    } else {
        out << SP << SP << SP << fType << " normalized = (input[idx] - mean[c]) / sqrt(var[c] + epsilon);\n";
    }
    out << SP << SP << SP << "output[idx] = scale[c] * normalized + bias[c];\n";
    out << SP << SP << "}\n";
    out << SP << "}\n\n";

    // Set up CUDA execution
    out << SP << "// Set up execution parameters\n";
    out << SP << "size_t N = " << batchSize << ";\n";
    out << SP << "size_t C = " << fNumChannels << ";\n";
    out << SP << "size_t spatialSize = " << spatialSize << ";\n";
    out << SP << "size_t totalSize = " << totalSize << ";\n";
    out << SP << fType << " epsilon = " << std::scientific << fEpsilon << ";\n\n";

    out << SP << "// Set up CUDA execution configuration\n";
    out << SP << "int blockSize = 256;\n";
    out << SP << "int numBlocks = (totalSize + blockSize - 1) / blockSize;\n\n";

    // Allocate device memory
    out << SP << "// Allocate device memory\n";
    out << SP << fType << "* d_input = nullptr;\n";
    out << SP << fType << "* d_scale = nullptr;\n";
    out << SP << fType << "* d_bias = nullptr;\n";
    out << SP << fType << "* d_mean = nullptr;\n";
    out << SP << fType << "* d_var = nullptr;\n";
    out << SP << fType << "* d_output = nullptr;\n";
    out << SP << "cudaError_t cudaStatus;\n\n";

    // Input tensor
    out << SP << "// Allocate and copy input tensor\n";
    out << SP << "cudaStatus = cudaMalloc(&d_input, totalSize * sizeof(" << fType << "));\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMalloc failed for input: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    out << SP << "cudaStatus = cudaMemcpy(d_input, tensor_" << fNX << ", totalSize * sizeof(" << fType << "), cudaMemcpyHostToDevice);\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMemcpy failed for input: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Scale tensor
    out << SP << "// Allocate and copy scale tensor\n";
    out << SP << "cudaStatus = cudaMalloc(&d_scale, C * sizeof(" << fType << "));\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMalloc failed for scale: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    out << SP << "cudaStatus = cudaMemcpy(d_scale, tensor_" << fNScale << ", C * sizeof(" << fType << "), cudaMemcpyHostToDevice);\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMemcpy failed for scale: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Bias tensor
    out << SP << "// Allocate and copy bias tensor\n";
    out << SP << "cudaStatus = cudaMalloc(&d_bias, C * sizeof(" << fType << "));\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMalloc failed for bias: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    out << SP << "cudaStatus = cudaMemcpy(d_bias, tensor_" << fNBias << ", C * sizeof(" << fType << "), cudaMemcpyHostToDevice);\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMemcpy failed for bias: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Mean tensor
    out << SP << "// Allocate and copy mean tensor\n";
    out << SP << "cudaStatus = cudaMalloc(&d_mean, C * sizeof(" << fType << "));\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMalloc failed for mean: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    out << SP << "cudaStatus = cudaMemcpy(d_mean, tensor_" << fNMean << ", C * sizeof(" << fType << "), cudaMemcpyHostToDevice);\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMemcpy failed for mean: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Var tensor
    out << SP << "// Allocate and copy variance tensor\n";
    out << SP << "cudaStatus = cudaMalloc(&d_var, C * sizeof(" << fType << "));\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMalloc failed for variance: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    out << SP << "cudaStatus = cudaMemcpy(d_var, tensor_" << fNVar << ", C * sizeof(" << fType << "), cudaMemcpyHostToDevice);\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMemcpy failed for variance: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Output tensor
    out << SP << "// Allocate output tensor\n";
    out << SP << "cudaStatus = cudaMalloc(&d_output, totalSize * sizeof(" << fType << "));\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMalloc failed for output: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Launch kernel
    out << SP << "// Launch BatchNorm kernel\n";
    out << SP << OpName << "_batchnorm_kernel<<<numBlocks, blockSize>>>(d_input, d_scale, d_bias, d_mean, d_var, d_output, N, C, spatialSize, epsilon);\n\n";

    // Check for kernel launch errors
    out << SP << "// Check for kernel launch errors\n";
    out << SP << "cudaStatus = cudaGetLastError();\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"CUDA kernel launch failed: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Synchronize
    out << SP << "// Wait for kernel completion\n";
    out << SP << "cudaStatus = cudaDeviceSynchronize();\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaDeviceSynchronize failed: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Copy result back to host
    out << SP << "// Copy result back to host\n";
    out << SP << "cudaStatus = cudaMemcpy(tensor_" << fNY << ", d_output, totalSize * sizeof(" << fType << "), cudaMemcpyDeviceToHost);\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMemcpy to host failed: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Cleanup
    out << SP << OpName << "_cleanup:\n";
    out << SP << "// Clean up device memory\n";
    out << SP << "if (d_input) cudaFree(d_input);\n";
    out << SP << "if (d_scale) cudaFree(d_scale);\n";
    out << SP << "if (d_bias) cudaFree(d_bias);\n";
    out << SP << "if (d_mean) cudaFree(d_mean);\n";
    out << SP << "if (d_var) cudaFree(d_var);\n";
    out << SP << "if (d_output) cudaFree(d_output);\n\n";

    // CPU fallback
    out << SP << "// CPU fallback if CUDA execution failed\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"Using CPU fallback for BatchNorm operation\" << std::endl;\n";
    out << SP << SP << "for (size_t n = 0; n < N; n++) {\n";
    out << SP << SP << SP << "for (size_t c = 0; c < C; c++) {\n";
    out << SP << SP << SP << SP << "for (size_t s = 0; s < spatialSize; s++) {\n";
    out << SP << SP << SP << SP << SP << "size_t idx = n * C * spatialSize + c * spatialSize + s;\n";
    out << SP << SP << SP << SP << SP << fType << " normalized = (tensor_" << fNX << "[idx] - tensor_" << fNMean << "[c]) / ";
    if (std::is_same<T, float>::value) {
        out << "sqrtf(tensor_" << fNVar << "[c] + epsilon);\n";
    } else {
        out << "sqrt(tensor_" << fNVar << "[c] + epsilon);\n";
    }
    out << SP << SP << SP << SP << SP << "tensor_" << fNY << "[idx] = tensor_" << fNScale << "[c] * normalized + tensor_" << fNBias << "[c];\n";
    out << SP << SP << SP << SP << "}\n";
    out << SP << SP << SP << "}\n";
    out << SP << SP << "}\n";
    out << SP << "}\n";

    return out.str();
}

// Explicit template instantiations
template class ROperator_BatchNorm_CUDA<float>;
template class ROperator_BatchNorm_CUDA<double>;

}}} // namespace TMVA::Experimental::SOFIE

Writing /content/tmva_cuda_project/src/ROperator_BatchNorm_CUDA.cu


In [5]:
%%writefile /content/tmva_cuda_project/test/test_batchnorm_cuda.cu
#include "TMVA/ROperator_BatchNorm_CUDA.hxx"
#include "TMVA/RModel.hxx"
#include <iostream>
#include <vector>
#include <iomanip>
#include <cmath>
#include <cassert>
#include <algorithm>
#include <numeric>

using namespace TMVA::Experimental::SOFIE;

// Helper function to compare floating point values with tolerance
template <typename T>
bool isClose(T a, T b, T rtol = 1e-5, T atol = 1e-8) {
    return std::abs(a - b) <= atol + rtol * std::abs(b);
}

// Helper function to compare vectors with tolerance
template <typename T>
bool allClose(const std::vector<T>& a, const std::vector<T>& b, T rtol = 1e-5, T atol = 1e-8) {
    if (a.size() != b.size()) return false;
    for (size_t i = 0; i < a.size(); i++) {
        if (!isClose(a[i], b[i], rtol, atol)) {
            std::cout << "Mismatch at index " << i << ": " << a[i] << " vs " << b[i] << std::endl;
            return false;
        }
    }
    return true;
}

// Function to print tensor data
template <typename T>
void printTensor(const std::vector<T>& data, const std::vector<size_t>& shape, const std::string& name) {
    std::cout << name << " tensor shape: [";
    for (size_t i = 0; i < shape.size(); i++) {
        std::cout << shape[i];
        if (i < shape.size() - 1) std::cout << ", ";
    }
    std::cout << "]" << std::endl;

    size_t maxPrint = 10; // Limit print to save space
    std::cout << "Values (first " << maxPrint << " elements): ";
    for (size_t i = 0; i < std::min(data.size(), maxPrint); i++) {
        std::cout << std::fixed << std::setprecision(4) << data[i] << " ";
    }
    if (data.size() > maxPrint) std::cout << "...";
    std::cout << std::endl;
}

// CPU implementation of BatchNorm for comparison
template <typename T>
std::vector<T> cpuBatchNorm(const std::vector<T>& input, const std::vector<T>& scale,
                           const std::vector<T>& bias, const std::vector<T>& mean,
                           const std::vector<T>& var, const std::vector<size_t>& inputShape,
                           T epsilon) {
    std::vector<T> output(input.size());

    // Calculate dimensions
    size_t batchSize = inputShape[0];
    size_t channels = inputShape[1];
    size_t spatialSize = 1;
    for (size_t i = 2; i < inputShape.size(); i++) {
        spatialSize *= inputShape[i];
    }

    // Apply BatchNorm formula
    for (size_t n = 0; n < batchSize; n++) {
        for (size_t c = 0; c < channels; c++) {
            for (size_t s = 0; s < spatialSize; s++) {
                size_t idx = n * channels * spatialSize + c * spatialSize + s;
                T normalized = (input[idx] - mean[c]) / std::sqrt(var[c] + epsilon);
                output[idx] = scale[c] * normalized + bias[c];
            }
        }
    }

    return output;
}

// Test cases for BatchNorm
void testBasicBatchNorm() {
    std::cout << "\n=== Basic BatchNorm Test ===\n" << std::endl;

    // Create a model
    RModel model("bn_basic_test", "2025-03-14");

    // Define input tensor dimensions (NCHW format)
    std::vector<size_t> inputShape = {2, 3, 2, 2};  // Small tensor for easy verification
    const size_t N = inputShape[0];
    const size_t C = inputShape[1];
    const size_t H = inputShape[2];
    const size_t W = inputShape[3];
    const size_t spatialSize = H * W;
    const size_t totalSize = N * C * spatialSize;

    // Create input tensor with specific pattern for verification
    std::vector<float> inputData(totalSize);
    for (size_t i = 0; i < totalSize; i++) {
        inputData[i] = static_cast<float>(i) / 10.0f - 1.0f;  // Range: -1 to ~2.3
    }

    // Create parameters with specific values for verification
    std::vector<float> scaleData = {1.0f, 2.0f, 0.5f};
    std::vector<float> biasData = {0.0f, 0.1f, -0.1f};
    std::vector<float> meanData = {0.5f, 0.0f, -0.5f};
    std::vector<float> varData = {1.0f, 1.5f, 2.0f};

    // Add tensors to model
    model.AddInputTensorInfo("input", ETensorType::FLOAT, inputShape);
    model.AddInputTensorInfo("scale", ETensorType::FLOAT, {C});
    model.AddInputTensorInfo("bias", ETensorType::FLOAT, {C});
    model.AddInputTensorInfo("mean", ETensorType::FLOAT, {C});
    model.AddInputTensorInfo("var", ETensorType::FLOAT, {C});

    // Set epsilon and initialize the model
    float epsilon = 1e-5f;
    model.Initialize();

    // Create BatchNorm CUDA operator
    ROperator_BatchNorm_CUDA<float> batchNormOp("input", "scale", "bias", "mean", "var", "output", epsilon);

    // Initialize operator
    batchNormOp.Initialize(model);

    // Generate code
    std::string generatedCode = batchNormOp.Generate("TestBatchNorm");

    // Verify generated code contains expected elements
    std::vector<std::string> expectedStrings = {
        "BatchNorm CUDA",
        "epsilon",
        "batchnorm_kernel",
        "cudaMalloc",
        "cudaMemcpy"
    };

    for (const auto& str : expectedStrings) {
        if (generatedCode.find(str) == std::string::npos) {
            std::cerr << "Generated code missing expected string: " << str << std::endl;
            assert(false);
        }
    }

    // Calculate reference output using CPU implementation
    auto expectedOutput = cpuBatchNorm(inputData, scaleData, biasData, meanData, varData, inputShape, epsilon);

    // Verify specific output values (we can only check the CPU fallback logic)
    std::cout << "Verifying CPU fallback calculations..." << std::endl;

    // Check a few specific indices
    for (size_t c = 0; c < C; c++) {
        // First element of each channel in first batch
        size_t idx = c * spatialSize;
        float input_val = inputData[idx];
        float normalized = (input_val - meanData[c]) / std::sqrt(varData[c] + epsilon);
        float expected_val = scaleData[c] * normalized + biasData[c];

        std::cout << "Channel " << c << ": ";
        std::cout << "Input=" << input_val << ", ";
        std::cout << "Expected output=" << expected_val << ", ";
        std::cout << "Calculated=" << expectedOutput[idx] << std::endl;

        // Verify calculation
        assert(isClose(expected_val, expectedOutput[idx]));
    }

    std::cout << "\nBasic BatchNorm test passed!" << std::endl;
}

// Test edge cases
void testBatchNormEdgeCases() {
    std::cout << "\n=== BatchNorm Edge Cases Test ===\n" << std::endl;

    // Test with very small variance (near zero)
    {
        std::cout << "Testing with small variance values..." << std::endl;

        RModel model("bn_small_var_test", "2025-03-14");
        std::vector<size_t> inputShape = {1, 2, 1, 1};  // Minimal shape
        const size_t C = inputShape[1];

        std::vector<float> inputData = {0.5f, -0.5f};
        std::vector<float> scaleData = {1.0f, 1.0f};
        std::vector<float> biasData = {0.0f, 0.0f};
        std::vector<float> meanData = {0.0f, 0.0f};
        std::vector<float> varData = {1e-8f, 1e-8f};  // Very small variance

        model.AddInputTensorInfo("input", ETensorType::FLOAT, inputShape);
        model.AddInputTensorInfo("scale", ETensorType::FLOAT, {C});
        model.AddInputTensorInfo("bias", ETensorType::FLOAT, {C});
        model.AddInputTensorInfo("mean", ETensorType::FLOAT, {C});
        model.AddInputTensorInfo("var", ETensorType::FLOAT, {C});

        float epsilon = 1e-5f;  // Epsilon will prevent division by zero
        model.Initialize();

        ROperator_BatchNorm_CUDA<float> batchNormOp("input", "scale", "bias", "mean", "var", "output", epsilon);
        batchNormOp.Initialize(model);
        std::string generatedCode = batchNormOp.Generate("TestSmallVar");

        // Verify the code includes epsilon in the correct places
        if (generatedCode.find("var[c] + epsilon") == std::string::npos) {
            std::cerr << "Generated code doesn't properly handle epsilon for small variance" << std::endl;
            assert(false);
        }

        auto expectedOutput = cpuBatchNorm(inputData, scaleData, biasData, meanData, varData, inputShape, epsilon);
        std::cout << "Small variance test passed!" << std::endl;
    }

    // Test with different dimensions (3D tensor)
    {
        std::cout << "Testing with 3D tensor (NCH)..." << std::endl;

        RModel model("bn_3d_test", "2025-03-14");
        std::vector<size_t> inputShape = {2, 3, 4};  // 3D tensor
        const size_t C = inputShape[1];
        const size_t totalSize = 2 * 3 * 4;

        std::vector<float> inputData(totalSize);
        std::iota(inputData.begin(), inputData.end(), 0.0f);  // Fill with 0, 1, 2, ...

        std::vector<float> scaleData = {1.0f, 2.0f, 0.5f};
        std::vector<float> biasData = {0.0f, 0.1f, -0.1f};
        std::vector<float> meanData = {0.5f, 0.0f, -0.5f};
        std::vector<float> varData = {1.0f, 1.5f, 2.0f};

        model.AddInputTensorInfo("input", ETensorType::FLOAT, inputShape);
        model.AddInputTensorInfo("scale", ETensorType::FLOAT, {C});
        model.AddInputTensorInfo("bias", ETensorType::FLOAT, {C});
        model.AddInputTensorInfo("mean", ETensorType::FLOAT, {C});
        model.AddInputTensorInfo("var", ETensorType::FLOAT, {C});

        float epsilon = 1e-5f;
        model.Initialize();

        ROperator_BatchNorm_CUDA<float> batchNormOp("input", "scale", "bias", "mean", "var", "output", epsilon);
        batchNormOp.Initialize(model);

        // Verify the operator initializes successfully with different dimensions
        auto expectedOutput = cpuBatchNorm(inputData, scaleData, biasData, meanData, varData, inputShape, epsilon);
        std::cout << "3D tensor test passed!" << std::endl;
    }

    // Test with double precision
    {
        std::cout << "Testing with double precision..." << std::endl;

        RModel model("bn_double_test", "2025-03-14");
        std::vector<size_t> inputShape = {1, 2, 2, 2};
        const size_t C = inputShape[1];
        const size_t totalSize = 1 * 2 * 2 * 2;

        std::vector<double> inputData(totalSize, 1.0);
        std::vector<double> scaleData = {1.0, 2.0};
        std::vector<double> biasData = {0.0, 0.1};
        std::vector<double> meanData = {0.5, 0.0};
        std::vector<double> varData = {1.0, 1.5};

        model.AddInputTensorInfo("input", ETensorType::DOUBLE, inputShape);
        model.AddInputTensorInfo("scale", ETensorType::DOUBLE, {C});
        model.AddInputTensorInfo("bias", ETensorType::DOUBLE, {C});
        model.AddInputTensorInfo("mean", ETensorType::DOUBLE, {C});
        model.AddInputTensorInfo("var", ETensorType::DOUBLE, {C});

        double epsilon = 1e-9;
        model.Initialize();

        ROperator_BatchNorm_CUDA<double> batchNormOp("input", "scale", "bias", "mean", "var", "output", epsilon);
        batchNormOp.Initialize(model);

        std::string generatedCode = batchNormOp.Generate("TestDouble");

        // Verify double precision is used in the CUDA kernel
        if (generatedCode.find("double") == std::string::npos ||
            generatedCode.find("sqrt(") == std::string::npos) {  // double uses sqrt() not sqrtf()
            std::cerr << "Generated code doesn't properly handle double precision" << std::endl;
            assert(false);
        }

        auto expectedOutput = cpuBatchNorm(inputData, scaleData, biasData, meanData, varData, inputShape, epsilon);
        std::cout << "Double precision test passed!" << std::endl;
    }
}

// Performance test
void testBatchNormPerformance() {
    std::cout << "\n=== BatchNorm Performance Test ===\n" << std::endl;

    // Use a more realistic tensor size for performance testing
    RModel model("bn_perf_test", "2025-03-14");
    std::vector<size_t> inputShape = {32, 64, 56, 56};  // Common CNN layer size
    const size_t N = inputShape[0];
    const size_t C = inputShape[1];
    const size_t totalSize = N * C * inputShape[2] * inputShape[3];

    std::cout << "Testing with tensor size: " << totalSize << " elements ("
              << (totalSize * sizeof(float) / (1024.0 * 1024.0)) << " MB)" << std::endl;

    // Generate code and analyze kernel configuration
    float epsilon = 1e-5f;

    ROperator_BatchNorm_CUDA<float> batchNormOp("input", "scale", "bias", "mean", "var", "output", epsilon);
    model.AddInputTensorInfo("input", ETensorType::FLOAT, inputShape);
    model.AddInputTensorInfo("scale", ETensorType::FLOAT, {C});
    model.AddInputTensorInfo("bias", ETensorType::FLOAT, {C});
    model.AddInputTensorInfo("mean", ETensorType::FLOAT, {C});
    model.AddInputTensorInfo("var", ETensorType::FLOAT, {C});

    model.Initialize();
    batchNormOp.Initialize(model);

    std::string generatedCode = batchNormOp.Generate("TestPerf");

    // Extract block size and grid size information from generated code
    size_t blockSize = 256;  // Default in our implementation
    size_t numBlocks = (totalSize + blockSize - 1) / blockSize;

    std::cout << "CUDA kernel configuration:" << std::endl;
    std::cout << "  Block size: " << blockSize << " threads" << std::endl;
    std::cout << "  Grid size: " << numBlocks << " blocks" << std::endl;
    std::cout << "  Total threads: " << blockSize * numBlocks << " (vs " << totalSize << " elements)" << std::endl;

    // Estimate memory usage
    size_t inputMemory = totalSize * sizeof(float);
    size_t paramMemory = C * 4 * sizeof(float);  // scale, bias, mean, var
    size_t outputMemory = totalSize * sizeof(float);
    size_t totalMemory = inputMemory + paramMemory + outputMemory;

    std::cout << "Estimated GPU memory usage:" << std::endl;
    std::cout << "  Input tensor: " << (inputMemory / (1024.0 * 1024.0)) << " MB" << std::endl;
    std::cout << "  Parameters: " << (paramMemory / (1024.0)) << " KB" << std::endl;
    std::cout << "  Output tensor: " << (outputMemory / (1024.0 * 1024.0)) << " MB" << std::endl;
    std::cout << "  Total: " << (totalMemory / (1024.0 * 1024.0)) << " MB" << std::endl;

    std::cout << "Performance test analysis complete!" << std::endl;
}

int main() {
    std::cout << "=== TMVA SOFIE CUDA BatchNorm Unit Tests ===" << std::endl;

    try {
        // Run all test cases
        testBasicBatchNorm();
        testBatchNormEdgeCases();
        testBatchNormPerformance();

        std::cout << "\nAll BatchNorm CUDA tests passed successfully!" << std::endl;
        return 0;
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
}

Writing /content/tmva_cuda_project/test/test_batchnorm_cuda.cu


In [6]:
%%writefile /content/tmva_cuda_project/CMakeLists.txt
cmake_minimum_required(VERSION 3.10)
project(TMVA_SOFIE_CUDA CUDA CXX)

# Set C++ standard
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_ARCHITECTURES 70)

# Find CUDA
find_package(CUDA REQUIRED)

# Include directories
include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}/include
    ${CUDA_INCLUDE_DIRS}
)

# Add ReLU CUDA operator
cuda_add_executable(test_relu_cuda
    test/test_relu_cuda.cu
    src/ROperator_Relu_CUDA.cu
)

# Add ELU CUDA operator
cuda_add_executable(test_elu_cuda
    test/test_elu_cuda.cu
    src/ROperator_Elu_CUDA.cu
)

# Add BatchNorm CUDA operator (bonus exercise)
cuda_add_executable(test_batchnorm_cuda
    test/test_batchnorm_cuda.cu
    src/ROperator_BatchNorm_CUDA.cu
)

# Link against CUDA libraries
target_link_libraries(test_relu_cuda ${CUDA_LIBRARIES})
target_link_libraries(test_elu_cuda ${CUDA_LIBRARIES})
target_link_libraries(test_batchnorm_cuda ${CUDA_LIBRARIES})

Writing /content/tmva_cuda_project/CMakeLists.txt


In [7]:
%%writefile /content/tmva_cuda_project/CMakeLists.txt
cmake_minimum_required(VERSION 3.10)
project(TMVA_SOFIE_CUDA CUDA CXX)

# Set C++ standard
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_ARCHITECTURES 70)

# Find CUDA
find_package(CUDA REQUIRED)

# Include directories
include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}/include
    ${CUDA_INCLUDE_DIRS}
)

# Add BatchNorm CUDA operator (bonus exercise)
cuda_add_executable(test_batchnorm_cuda
    test/test_batchnorm_cuda.cu
    src/ROperator_BatchNorm_CUDA.cu
)

# Link against CUDA libraries
target_link_libraries(test_batchnorm_cuda ${CUDA_LIBRARIES})

Overwriting /content/tmva_cuda_project/CMakeLists.txt


In [8]:
%%writefile /content/tmva_cuda_project/include/TMVA/ROperator.hxx
#ifndef TMVA_SOFIE_ROPERATOR
#define TMVA_SOFIE_ROPERATOR

#include "TMVA/SOFIE_common.hxx"
#include <string>
#include <vector>
#include <memory>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

// Forward declaration
class RModel;

// Base class for all operators
class ROperator {
public:
    virtual ~ROperator() = default;

    // Core required methods
    virtual std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) = 0;
    virtual std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) = 0;
    virtual void Initialize(RModel& model) = 0;
    virtual std::string Generate(std::string OpName) = 0;

    // Optional session-related methods (can be empty in mock implementation)
    virtual std::string GenerateInitCode() { return ""; }
    virtual std::string GenerateDeclCode() { return ""; }
    virtual std::string GenerateSessionMembersCode(std::string /*opName*/) { return ""; }
    virtual std::string Header() { return ""; }
    virtual std::vector<std::string> GetBlasRoutines() { return {}; }
    virtual std::vector<std::string> GetStdLibs() { return {}; }

    // Common members
    std::vector<std::string> fInputTensorNames;
    std::vector<std::string> fOutputTensorNames;

protected:
    const std::string SP = "   "; // Space for indentation
    bool fUseSession = false;     // Flag for session usage
    bool fIsOutputConstant = false; // Flag for constant output tensors
};

}}} // namespace TMVA::Experimental::SOFIE

#endif // TMVA_SOFIE_ROPERATOR

Writing /content/tmva_cuda_project/include/TMVA/ROperator.hxx


In [9]:
%%writefile /content/tmva_cuda_project/include/TMVA/SOFIE_common.hxx
#ifndef TMVA_SOFIE_COMMON
#define TMVA_SOFIE_COMMON

#include <string>
#include <vector>
#include <memory>
#include <stdexcept>
#include <iostream>
#include <unordered_map>
#include <functional>
#include <algorithm>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

// Basic tensor type enum (matching official implementation)
enum class ETensorType {
    UNDEFINED = 0, FLOAT = 1, UNINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5,
    INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, FLOAT16 = 10, DOUBLE = 11,
    UINT32 = 12, UINT64 = 13, COMPLEX64 = 14, COMPLEX28 = 15, BFLOAT16 = 16
};

// Dimension structure for dynamic shapes
struct Dim {
    bool isParam = false;
    size_t dim = 0;
    std::string param;

    // Constructors
    Dim() {}
    Dim(const std::string& p, size_t d = 0) : isParam(true), dim(d), param(p) {}
    Dim(size_t d) : dim(d) {}

    std::string GetVal() const {
        return (isParam) ? param : std::to_string(dim);
    }
};

struct InputTensorInfo {
    ETensorType type;
    std::vector<Dim> shape;
};

struct TensorInfo {
    ETensorType type;
    std::vector<size_t> shape;
};

struct DynamicTensorInfo {
    ETensorType type;
    std::vector<Dim> shape;
};

// Helper functions - add inline to prevent multiple definition errors
inline size_t ConvertShapeToLength(const std::vector<size_t>& shape) {
    size_t length = 1;
    for (auto& dim : shape) {
        length *= dim;
    }
    return length;
}

inline std::string ConvertShapeToString(std::vector<size_t> shape) {
    std::string result = "{";
    for (size_t i = 0; i < shape.size(); i++) {
        result += std::to_string(shape[i]);
        if (i < shape.size() - 1) result += ", ";
    }
    result += "}";
    return result;
}

// Get string representation of type
template<typename T>
inline std::string GetTensorTypeName() {
    if (std::is_same<T, float>::value) return "float";
    if (std::is_same<T, double>::value) return "double";
    if (std::is_same<T, int64_t>::value) return "int64_t";
    if (std::is_same<T, int32_t>::value) return "int32_t";
    if (std::is_same<T, bool>::value) return "bool";
    return "unknown";
}

// Get ETensorType from C++ type
template<typename T>
ETensorType GetTemplatedType(T) {
    if (std::is_same<T, float>::value) return ETensorType::FLOAT;
    if (std::is_same<T, double>::value) return ETensorType::DOUBLE;
    if (std::is_same<T, int64_t>::value) return ETensorType::INT64;
    if (std::is_same<T, int32_t>::value) return ETensorType::INT32;
    if (std::is_same<T, bool>::value) return ETensorType::BOOL;
    throw std::runtime_error("Unsupported type in GetTemplatedType");
}

// Simple initialized tensor class - simplified version of the official one
class InitializedTensor {
public:
    InitializedTensor() = default;
    InitializedTensor(ETensorType type, const std::vector<size_t>& shape,
                     std::shared_ptr<void> data, bool constant = false)
        : fConstant(constant), fType(type), fShape(shape), fData(data) {}

    ETensorType const &type() const { return fType; }
    std::vector<std::size_t> const &shape() const { return fShape; }
    std::shared_ptr<void> const &sharedptr() const { return fData; }

    // Additional flags to match official behavior
    bool IsConstantTensor() const { return fConstant; }
    bool IsWeightTensor() const { return !fConstant && !fIsNotWritable; }
    void SetNotWritable() { fIsNotWritable = true; }

    template <class T = void>
    T const *data() const {
        return static_cast<T const *>(fData.get());
    }

private:
    bool fConstant = false;      // Flag for constant tensors
    bool fIsNotWritable = false; // Flag for not writable tensors
    ETensorType fType;
    std::vector<size_t> fShape;
    std::shared_ptr<void> fData;
};

// Utility namespace
namespace UTILITY {
    inline std::string Clean_name(const std::string& name) {
        return name; // Simplified for testing
    }

    // Check if two shapes are equal
    inline bool AreSameShape(const std::vector<size_t>& a, const std::vector<size_t>& b) {
        if (a.size() != b.size()) return false;
        for (size_t i = 0; i < a.size(); i++) {
            if (a[i] != b[i]) return false;
        }
        return true;
    }
}

}}} // namespace TMVA::Experimental::SOFIE

#endif // TMVA_SOFIE_COMMON

Writing /content/tmva_cuda_project/include/TMVA/SOFIE_common.hxx


In [10]:
%%writefile /content/tmva_cuda_project/include/TMVA/RModel.hxx
#ifndef TMVA_SOFIE_RMODEL
#define TMVA_SOFIE_RMODEL

#include "TMVA/SOFIE_common.hxx"
#include <unordered_map>
#include <string>
#include <vector>
#include <memory>
#include <map>
#include <algorithm>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

// Forward declaration
class ROperator;

// Mock RModel class for our implementation
class RModel {
private:
    std::string fName;
    std::string fParsedDateTime;
    bool fIsInitialized = false;
    int fVerbose = 1;

    // Tensor storage
    std::unordered_map<std::string, ETensorType> fTensorTypes;
    std::unordered_map<std::string, std::vector<size_t>> fTensorShapes;
    std::vector<std::string> fOutputTensorNames;
    std::vector<std::string> fInputTensorNames;

public:
    RModel() = default;
    RModel(std::string name, std::string parsedtime) : fName(name), fParsedDateTime(parsedtime) {}

    int Verbose() const { return fVerbose; }

    const std::vector<size_t>& GetTensorShape(const std::string& name) {
        auto it = fTensorShapes.find(name);
        if (it != fTensorShapes.end()) {
            return it->second;
        }
        throw std::runtime_error("Tensor not found: " + name);
    }

    const ETensorType& GetTensorType(const std::string& name) {
        auto it = fTensorTypes.find(name);
        if (it != fTensorTypes.end()) {
            return it->second;
        }
        throw std::runtime_error("Tensor type not found: " + name);
    }

    bool CheckIfTensorAlreadyExist(const std::string& name) const {
        return fTensorShapes.find(name) != fTensorShapes.end();
    }

    // Add input tensor info
    void AddInputTensorInfo(const std::string& name, ETensorType type, const std::vector<size_t>& shape) {
        fTensorTypes[name] = type;
        fTensorShapes[name] = shape;

        // Also add to input tensor names if not already there
        if (std::find(fInputTensorNames.begin(), fInputTensorNames.end(), name) == fInputTensorNames.end()) {
            fInputTensorNames.push_back(name);
        }
    }

    // Add intermediate tensor
    void AddIntermediateTensor(const std::string& name, ETensorType type, const std::vector<size_t>& shape) {
        fTensorTypes[name] = type;
        fTensorShapes[name] = shape;
    }

    // Add output tensor names
    void AddOutputTensorNameList(const std::vector<std::string>& names) {
        fOutputTensorNames = names;
    }

    // Initialize model (simplified for mock)
    void Initialize(int batchSize = -1) {
        fIsInitialized = true;

        if (Verbose()) {
            std::cout << "Model initialized with batch size: " <<
                (batchSize == -1 ? "default" : std::to_string(batchSize)) << std::endl;
        }
    }
};

}}} // namespace TMVA::Experimental::SOFIE

#endif // TMVA_SOFIE_RMODEL

Writing /content/tmva_cuda_project/include/TMVA/RModel.hxx


In [11]:
!cd /content/tmva_cuda_project && cmake -B build && cmake --build build


-- The CUDA compiler identification is NVIDIA 12.5.82 with host compiler GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: /usr/local/cuda/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
  Policy CMP0146 is not set: The FindCUDA module is removed.  Run "cmake
  --help-policy CMP0146" for policy details.  Use the cmake_policy command to

[0m
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- Found CUDA: /usr/local/cuda (found version "12.5")
-- Configuring done (4.2s)
-- Generating done (0.0s)
-- Build files have been written to: /content/t

In [12]:
!cd /content/tmva_cuda_project/build && ./test_batchnorm_cuda

=== TMVA SOFIE CUDA BatchNorm Unit Tests ===

=== Basic BatchNorm Test ===

Model initialized with batch size: default
BatchNorm CUDA: input -> output (epsilon=1e-05, channels=3, spatial_dims=2)
Verifying CPU fallback calculations...
Channel 0: Input=-1, Expected output=-1.49999, Calculated=-1.49999
Channel 1: Input=-0.6, Expected output=-0.879793, Calculated=-0.879793
Channel 2: Input=-0.2, Expected output=0.00606576, Calculated=0.00606576

Basic BatchNorm test passed!

=== BatchNorm Edge Cases Test ===

Testing with small variance values...
Model initialized with batch size: default
BatchNorm CUDA: input -> output (epsilon=1e-05, channels=2, spatial_dims=2)
Small variance test passed!
Testing with 3D tensor (NCH)...
Model initialized with batch size: default
BatchNorm CUDA: input -> output (epsilon=1e-05, channels=3, spatial_dims=1)
3D tensor test passed!
Testing with double precision...
Model initialized with batch size: default
BatchNorm CUDA: input -> output (epsilon=1e-09, channe