In [1]:
# Step 1: Download the pre-built ROOT tarball from GitHub Releases
!wget -q --show-progress https://github.com/MohamedElashri/ROOT/releases/download/ubuntu/root_v6.30.04_Ubuntu_Python3.11.zip
# Step 2: Extract the ROOT files
!unzip -q root_v6.30.04_Ubuntu_Python3.11.zip

# Step 3: Install missing system dependencies for ROOT
!sudo ldconfig & apt-get install -y git dpkg-dev cmake g++ gcc binutils libx11-dev libxpm-dev libxft-dev libxext-dev tar gfortran subversion libpython3.11-dev

# Step 4: Remove the tarball to free up space
!rm -f root_v6.30.04_Ubuntu_Python3.11.zip

# Step 5: Install Compatible libssl

!wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb
!sudo dpkg -i libssl1.1_1.1.1f-1ubuntu2_amd64.deb
!rm -f libssl1.1_1.1.1f-1ubuntu2_amd64.deb


/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_loader.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_adapter_opencl.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtcm_debug.so.1 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libumf.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtcm.so.1 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libhwloc.so.15 is

In [2]:
!mkdir -p /content/tmva_cuda_project/include/TMVA
!mkdir -p /content/tmva_cuda_project/src
!mkdir -p /content/tmva_cuda_project/test
!mkdir -p /content/tmva_cuda_project/build

In [3]:
%%writefile /content/tmva_cuda_project/include/TMVA/ROperator_Elu_CUDA.hxx
#ifndef TMVA_SOFIE_ROPERATOR_ELU_CUDA
#define TMVA_SOFIE_ROPERATOR_ELU_CUDA

#include "TMVA/RModel.hxx"
#include "TMVA/ROperator.hxx"
#include "TMVA/SOFIE_common.hxx"
#include <cuda_runtime.h>
#include <vector>
#include <string>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

template <typename T>
class ROperator_Elu_CUDA final : public ROperator
{
private:
   /* Attributes */
   float falpha = 1.0; // default value
   std::string fNX;
   std::string fNY;
   std::vector<size_t> fShape;
   std::string fType;

public:
   ROperator_Elu_CUDA() {}

   ROperator_Elu_CUDA(float alpha, std::string nameX, std::string nameY):
      falpha(alpha), fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
   {
      fInputTensorNames = { fNX };
      fOutputTensorNames = { fNY };

      if (std::is_same<T, float>::value) {
         fType = "float";
      } else if (std::is_same<T, double>::value) {
         fType = "double";
      } else {
         throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Elu CUDA operator");
      }
   }

   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
      return input;
   }

   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
      auto ret = input; // suggest copy to compiler
      return ret;
   }

   // Required ROperator interface methods
   void Initialize(RModel& model) override;
   std::string Generate(std::string OpName) override;
};

// Declare template specializations
extern template class ROperator_Elu_CUDA<float>;
extern template class ROperator_Elu_CUDA<double>;

}}} // namespace TMVA::Experimental::SOFIE

#endif // TMVA_SOFIE_ROPERATOR_ELU_CUDA

Writing /content/tmva_cuda_project/include/TMVA/ROperator_Elu_CUDA.hxx


In [4]:
%%writefile /content/tmva_cuda_project/test/test_elu_cuda.cu
#include "TMVA/ROperator_Elu_CUDA.hxx"
#include "TMVA/RModel.hxx"
#include <iostream>
#include <vector>
#include <chrono>
#include <iomanip>
#include <cmath>

using namespace TMVA::Experimental::SOFIE;

// Function to print tensor data
template <typename T>
void printTensor(const std::vector<T>& data, const std::vector<size_t>& shape) {
    if (shape.size() == 1) {
        for (size_t i = 0; i < std::min(data.size(), size_t(10)); i++) {
            std::cout << std::fixed << std::setprecision(4) << data[i] << " ";
        }
        if (data.size() > 10) std::cout << "...";
        std::cout << std::endl;
    } else if (shape.size() == 2) {
        for (size_t i = 0; i < std::min(shape[0], size_t(5)); i++) {
            for (size_t j = 0; j < std::min(shape[1], size_t(10)); j++) {
                std::cout << std::fixed << std::setprecision(4) << data[i * shape[1] + j] << " ";
            }
            if (shape[1] > 10) std::cout << "...";
            std::cout << std::endl;
        }
        if (shape[0] > 5) std::cout << "..." << std::endl;
    }
}

// CPU implementation of ELU for comparison
template <typename T>
std::vector<T> cpuELU(const std::vector<T>& input, T alpha) {
    std::vector<T> output(input.size());
    for (size_t i = 0; i < input.size(); i++) {
        T x = input[i];
        output[i] = x >= 0 ? x : alpha * (std::exp(x) - 1);
    }
    return output;
}

int main() {
    std::cout << "Testing TMVA SOFIE CUDA ELU Operator" << std::endl;
    std::cout << "====================================" << std::endl;

    try {
        // Create a model
        RModel model("cuda_elu_test", "2025-03-14");

        // Create input tensor with range of values
        std::vector<size_t> shape = {5, 3};
        std::vector<float> input_data = {
            // Values from -2.0 to 2.0 to show ELU behavior
            -2.0f, -1.5f, -1.0f,
            -0.5f, 0.0f, 0.5f,
            1.0f, 1.5f, 2.0f,
            -0.3f, -0.2f, -0.1f,
            0.1f, 0.2f, 0.3f
        };

        // Add input tensor to model
        model.AddInputTensorInfo("input", ETensorType::FLOAT, shape);

        // Initialize the model
        model.Initialize();

        // Test with different alpha values
        float alpha_values[] = {0.1f, 1.0f, 2.0f};

        for (float alpha : alpha_values) {
            std::cout << "\n---- Testing with alpha = " << alpha << " ----" << std::endl;

            // Create ELU CUDA operator
            ROperator_Elu_CUDA<float> eluOp(alpha, "input", "output");

            // Initialize operator
            eluOp.Initialize(model);

            // Generate code
            std::string generatedCode = eluOp.Generate("TestElu");

            // Print the generated code (excerpt)
            std::cout << "\nGenerated CUDA code (excerpt):" << std::endl;
            std::cout << generatedCode.substr(0, 250) << "...\n" << std::endl;

            // Calculate expected output using CPU implementation
            auto expected_output = cpuELU(input_data, alpha);

            // Print input and expected output
            std::cout << "Input tensor:" << std::endl;
            printTensor(input_data, shape);

            std::cout << "\nExpected output tensor (CPU ELU with alpha=" << alpha << "):" << std::endl;
            printTensor(expected_output, shape);

            // Highlight specific values to demonstrate ELU behavior
            std::cout << "\nELU behavior examples (alpha=" << alpha << "):" << std::endl;
            std::cout << "For x = -1.0: ELU(x) = " << (alpha * (std::exp(-1.0f) - 1.0f)) << std::endl;
            std::cout << "For x = 0.0: ELU(x) = " << 0.0f << std::endl;
            std::cout << "For x = 1.0: ELU(x) = " << 1.0f << std::endl;
        }

        std::cout << "\nELU CUDA operator test completed successfully!" << std::endl;

        return 0;
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
}

Writing /content/tmva_cuda_project/test/test_elu_cuda.cu


In [5]:
%%writefile /content/tmva_cuda_project/CMakeLists.txt
cmake_minimum_required(VERSION 3.10)
project(TMVA_SOFIE_CUDA CUDA CXX)

# Set C++ standard
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_ARCHITECTURES 70)

# Find CUDA
find_package(CUDA REQUIRED)

# Include directories
include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}/include
    ${CUDA_INCLUDE_DIRS}
)



# Add ELU CUDA operator
cuda_add_executable(test_elu_cuda
    test/test_elu_cuda.cu
    src/ROperator_Elu_CUDA.cu
)

# Link against CUDA libraries

target_link_libraries(test_elu_cuda ${CUDA_LIBRARIES})

Writing /content/tmva_cuda_project/CMakeLists.txt


In [6]:

%%writefile /content/tmva_cuda_project/include/TMVA/ROperator_Elu_CUDA.hxx
#ifndef TMVA_SOFIE_ROPERATOR_ELU_CUDA
#define TMVA_SOFIE_ROPERATOR_ELU_CUDA

#include "TMVA/ROperator.hxx"
#include "TMVA/SOFIE_common.hxx"
#include <cuda_runtime.h>
#include <vector>
#include <string>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

template <typename T>
class ROperator_Elu_CUDA final : public ROperator
{
private:
   /* Attributes */
   float falpha = 1.0; // default value
   std::string fNX;
   std::string fNY;
   std::vector<size_t> fShape;
   std::string fType;

public:
   ROperator_Elu_CUDA() {}

   ROperator_Elu_CUDA(float alpha, std::string nameX, std::string nameY):
      falpha(alpha), fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
   {
      fInputTensorNames = { fNX };
      fOutputTensorNames = { fNY };

      if (std::is_same<T, float>::value) {
         fType = "float";
      } else if (std::is_same<T, double>::value) {
         fType = "double";
      } else {
         throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Elu CUDA operator");
      }
   }

   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) {
      return input;
   }

   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) {
      auto ret = input; // suggest copy to compiler
      return ret;
   }

   // Required ROperator interface methods
   void Initialize(RModel& model) override;
   std::string Generate(std::string OpName) override;
};

// Declare template specializations
extern template class ROperator_Elu_CUDA<float>;
extern template class ROperator_Elu_CUDA<double>;

}}} // namespace TMVA::Experimental::SOFIE

#endif // TMVA_SOFIE_ROPERATOR_ELU_CUDA

Overwriting /content/tmva_cuda_project/include/TMVA/ROperator_Elu_CUDA.hxx


In [7]:

%%writefile /content/tmva_cuda_project/src/ROperator_Elu_CUDA.cu
#include "TMVA/ROperator_Elu_CUDA.hxx"
#include "TMVA/RModel.hxx"  // Explicitly include RModel.hxx here
#include <sstream>
#include <cmath>
#include <iomanip>
#include <limits>

// CUDA kernel for ELU with float
__global__ void eluKernelFloat(const float* input, float* output, size_t size, float alpha) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        float x = input[idx];
        output[idx] = x >= 0.0f ? x : alpha * (expf(x) - 1.0f);
    }
}

// CUDA kernel for ELU with double precision
__global__ void eluKernelDouble(const double* input, double* output, size_t size, double alpha) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        double x = input[idx];
        output[idx] = x >= 0.0 ? x : alpha * (exp(x) - 1.0);
    }
}

namespace TMVA {
namespace Experimental {
namespace SOFIE {

template <typename T>
void ROperator_Elu_CUDA<T>::Initialize(RModel& model)
{
    if (model.CheckIfTensorAlreadyExist(fNX) == false) {
        throw std::runtime_error("TMVA SOFIE Elu CUDA Op Input Tensor " + fNX + " is not found in model");
    }

    // Get shape from the model
    fShape = model.GetTensorShape(fNX);

    // Add output tensor to the model with same type and shape as input
    model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);

    if (model.Verbose()) {
        std::cout << "ELU CUDA: " << fNX << " -> " << fNY << " (alpha=" << falpha << ")" << std::endl;
    }
}

template <typename T>
std::string ROperator_Elu_CUDA<T>::Generate(std::string OpName)
{
    OpName = "op_" + OpName;
    if (fShape.empty()) {
        throw std::runtime_error("TMVA SOFIE Operator Elu CUDA called to Generate without being initialized first");
    }

    std::stringstream out;
    size_t length = ConvertShapeToLength(fShape);

    // Define alpha parameter with full precision
    out << SP << fType << " " << OpName << "_alpha = "
        << std::setprecision(std::numeric_limits<float>::max_digits10) << falpha << ";\n";

    // Begin CUDA implementation
    out << "\n//------ ELU CUDA\n";

    // 1. Define the kernel
    out << SP << "// CUDA kernel for ELU operation\n";
    out << SP << "__global__ void " << OpName << "_elu_kernel(const " << fType << "* input, "
        << fType << "* output, size_t size, " << fType << " alpha) {\n";
    out << SP << SP << "int idx = blockIdx.x * blockDim.x + threadIdx.x;\n";
    out << SP << SP << "if (idx < size) {\n";
    out << SP << SP << SP << fType << " x = input[idx];\n";

    // Type-specific implementation
    if (std::is_same<T, float>::value) {
        out << SP << SP << SP << "output[idx] = x >= 0.0f ? x : alpha * (expf(x) - 1.0f);\n";
    } else if (std::is_same<T, double>::value) {
        out << SP << SP << SP << "output[idx] = x >= 0.0 ? x : alpha * (exp(x) - 1.0);\n";
    }

    out << SP << SP << "}\n";
    out << SP << "}\n\n";

    // 2. Execution code block
    out << SP << "// Calculate execution configuration\n";
    out << SP << "size_t size = " << length << ";\n";
    out << SP << "int blockSize = 256;\n";
    out << SP << "int numBlocks = (size + blockSize - 1) / blockSize;\n\n";

    // GPU Memory allocation
    out << SP << "// Allocate device memory\n";
    out << SP << fType << "* d_input = nullptr;\n";
    out << SP << fType << "* d_output = nullptr;\n";
    out << SP << "cudaError_t cudaStatus;\n\n";

    // Error handling and memory management
    out << SP << "// CUDA memory allocation\n";
    out << SP << "cudaStatus = cudaMalloc(&d_input, size * sizeof(" << fType << "));\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMalloc failed for input: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    out << SP << "cudaStatus = cudaMalloc(&d_output, size * sizeof(" << fType << "));\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMalloc failed for output: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Copy input to device
    out << SP << "// Copy input to device\n";
    out << SP << "cudaStatus = cudaMemcpy(d_input, tensor_" << fNX << ", size * sizeof(" << fType << "), cudaMemcpyHostToDevice);\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMemcpy to device failed: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Launch kernel
    out << SP << "// Launch ELU kernel\n";
    out << SP << OpName << "_elu_kernel<<<numBlocks, blockSize>>>(d_input, d_output, size, " << OpName << "_alpha);\n\n";

    // Check for kernel errors
    out << SP << "// Check for kernel errors\n";
    out << SP << "cudaStatus = cudaGetLastError();\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"CUDA kernel launch failed: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Synchronize
    out << SP << "// Wait for kernel completion\n";
    out << SP << "cudaStatus = cudaDeviceSynchronize();\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaDeviceSynchronize failed: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Copy result back to host
    out << SP << "// Copy result back to host\n";
    out << SP << "cudaStatus = cudaMemcpy(tensor_" << fNY << ", d_output, size * sizeof(" << fType << "), cudaMemcpyDeviceToHost);\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"cudaMemcpy to host failed: \" << cudaGetErrorString(cudaStatus) << std::endl;\n";
    out << SP << SP << "goto " << OpName << "_cleanup;\n";
    out << SP << "}\n\n";

    // Cleanup section
    out << SP << OpName << "_cleanup:\n";
    out << SP << "// Clean up device memory\n";
    out << SP << "if (d_input) cudaFree(d_input);\n";
    out << SP << "if (d_output) cudaFree(d_output);\n\n";

    // CPU fallback if CUDA fails
    out << SP << "// CPU fallback if CUDA execution failed\n";
    out << SP << "if (cudaStatus != cudaSuccess) {\n";
    out << SP << SP << "std::cerr << \"Using CPU fallback for ELU operation\" << std::endl;\n";
    out << SP << SP << "for (int id = 0; id < " << length << "; id++) {\n";
    out << SP << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] >= 0) ? tensor_" << fNX;
    out << "[id] : " << OpName << "_alpha * std::exp(tensor_" << fNX << "[id]) - 1);\n";
    out << SP << SP << "}\n";
    out << SP << "}\n";

    return out.str();
}

// Explicit template instantiations
template class ROperator_Elu_CUDA<float>;
template class ROperator_Elu_CUDA<double>;

}}} // namespace TMVA::Experimental::SOFIE

Writing /content/tmva_cuda_project/src/ROperator_Elu_CUDA.cu


In [8]:
%%writefile /content/tmva_cuda_project/test/test_elu_cuda.cu
#include "TMVA/ROperator_Elu_CUDA.hxx"
#include "TMVA/RModel.hxx"
#include <iostream>
#include <vector>
#include <chrono>
#include <iomanip>
#include <cmath>

using namespace TMVA::Experimental::SOFIE;

// Function to print tensor data
template <typename T>
void printTensor(const std::vector<T>& data, const std::vector<size_t>& shape) {
    if (shape.size() == 1) {
        for (size_t i = 0; i < std::min(data.size(), size_t(10)); i++) {
            std::cout << std::fixed << std::setprecision(4) << data[i] << " ";
        }
        if (data.size() > 10) std::cout << "...";
        std::cout << std::endl;
    } else if (shape.size() == 2) {
        for (size_t i = 0; i < std::min(shape[0], size_t(5)); i++) {
            for (size_t j = 0; j < std::min(shape[1], size_t(10)); j++) {
                std::cout << std::fixed << std::setprecision(4) << data[i * shape[1] + j] << " ";
            }
            if (shape[1] > 10) std::cout << "...";
            std::cout << std::endl;
        }
        if (shape[0] > 5) std::cout << "..." << std::endl;
    }
}

// CPU implementation of ELU for comparison
template <typename T>
std::vector<T> cpuELU(const std::vector<T>& input, T alpha) {
    std::vector<T> output(input.size());
    for (size_t i = 0; i < input.size(); i++) {
        T x = input[i];
        output[i] = x >= 0 ? x : alpha * (std::exp(x) - 1);
    }
    return output;
}

int main() {
    std::cout << "Testing TMVA SOFIE CUDA ELU Operator" << std::endl;
    std::cout << "====================================" << std::endl;

    try {
        // Create a model
        RModel model("cuda_elu_test", "2025-03-14");

        // Create input tensor with range of values
        std::vector<size_t> shape = {5, 3};
        std::vector<float> input_data = {
            // Values from -2.0 to 2.0 to show ELU behavior
            -2.0f, -1.5f, -1.0f,
            -0.5f, 0.0f, 0.5f,
            1.0f, 1.5f, 2.0f,
            -0.3f, -0.2f, -0.1f,
            0.1f, 0.2f, 0.3f
        };

        // Add input tensor to model
        model.AddInputTensorInfo("input", ETensorType::FLOAT, shape);

        // Initialize the model
        model.Initialize();

        // Test with different alpha values
        float alpha_values[] = {0.1f, 1.0f, 2.0f};

        for (float alpha : alpha_values) {
            std::cout << "\n---- Testing with alpha = " << alpha << " ----" << std::endl;

            // Create ELU CUDA operator
            ROperator_Elu_CUDA<float> eluOp(alpha, "input", "output");

            // Initialize operator
            eluOp.Initialize(model);

            // Generate code
            std::string generatedCode = eluOp.Generate("TestElu");

            // Print the generated code (excerpt)
            std::cout << "\nGenerated CUDA code (excerpt):" << std::endl;
            std::cout << generatedCode.substr(0, 250) << "...\n" << std::endl;

            // Calculate expected output using CPU implementation
            auto expected_output = cpuELU(input_data, alpha);

            // Print input and expected output
            std::cout << "Input tensor:" << std::endl;
            printTensor(input_data, shape);

            std::cout << "\nExpected output tensor (CPU ELU with alpha=" << alpha << "):" << std::endl;
            printTensor(expected_output, shape);

            // Highlight specific values to demonstrate ELU behavior
            std::cout << "\nELU behavior examples (alpha=" << alpha << "):" << std::endl;
            std::cout << "For x = -1.0: ELU(x) = " << (alpha * (std::exp(-1.0f) - 1.0f)) << std::endl;
            std::cout << "For x = 0.0: ELU(x) = " << 0.0f << std::endl;
            std::cout << "For x = 1.0: ELU(x) = " << 1.0f << std::endl;
        }

        std::cout << "\nELU CUDA operator test completed successfully!" << std::endl;

        return 0;
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
}

Overwriting /content/tmva_cuda_project/test/test_elu_cuda.cu


In [9]:
%%writefile /content/tmva_cuda_project/include/TMVA/ROperator.hxx
#ifndef TMVA_SOFIE_ROPERATOR
#define TMVA_SOFIE_ROPERATOR

#include "TMVA/SOFIE_common.hxx"
#include <string>
#include <vector>
#include <memory>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

// Forward declaration
class RModel;

// Base class for all operators
class ROperator {
public:
    virtual ~ROperator() = default;

    // Core required methods
    virtual std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) = 0;
    virtual std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) = 0;
    virtual void Initialize(RModel& model) = 0;
    virtual std::string Generate(std::string OpName) = 0;

    // Optional session-related methods (can be empty in mock implementation)
    virtual std::string GenerateInitCode() { return ""; }
    virtual std::string GenerateDeclCode() { return ""; }
    virtual std::string GenerateSessionMembersCode(std::string /*opName*/) { return ""; }
    virtual std::string Header() { return ""; }
    virtual std::vector<std::string> GetBlasRoutines() { return {}; }
    virtual std::vector<std::string> GetStdLibs() { return {}; }

    // Common members
    std::vector<std::string> fInputTensorNames;
    std::vector<std::string> fOutputTensorNames;

protected:
    const std::string SP = "   "; // Space for indentation
    bool fUseSession = false;     // Flag for session usage
    bool fIsOutputConstant = false; // Flag for constant output tensors
};

}}} // namespace TMVA::Experimental::SOFIE

#endif // TMVA_SOFIE_ROPERATOR

Writing /content/tmva_cuda_project/include/TMVA/ROperator.hxx


In [10]:
%%writefile /content/tmva_cuda_project/include/TMVA/RModel.hxx
#ifndef TMVA_SOFIE_RMODEL
#define TMVA_SOFIE_RMODEL

#include "TMVA/SOFIE_common.hxx"
#include "TMVA/ROperator.hxx"
#include <unordered_map>
#include <string>
#include <vector>
#include <memory>
#include <map>
#include <algorithm>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

// Options enum for code generation
enum class Options {
    kDefault = 0,
    kNoSession = 1
};

// Mock RModel class that includes more official-like interfaces
class RModel {
private:
    std::string fName;
    std::string fParsedDateTime;
    bool fIsInitialized = false;
    int fVerbose = 1;
    bool fUseSession = false;

    // Tensor storage
    std::unordered_map<std::string, InputTensorInfo> fInputTensorInfos;
    std::unordered_map<std::string, TensorInfo> fReadyInputTensorInfos;
    std::unordered_map<std::string, InitializedTensor> fInitializedTensors;
    std::unordered_map<std::string, TensorInfo> fIntermediateTensorInfos;
    std::unordered_map<std::string, DynamicTensorInfo> fDynamicTensorInfos;
    std::vector<std::string> fOutputTensorNames;
    std::vector<std::string> fInputTensorNames;

    std::vector<std::unique_ptr<ROperator>> fOperators;

public:
    RModel() = default;
    RModel(std::string name, std::string parsedtime) : fName(name), fParsedDateTime(parsedtime) {}

    int Verbose() const { return fVerbose; }

    const std::vector<size_t>& GetTensorShape(const std::string& name) {
        auto it = fReadyInputTensorInfos.find(name);
        if (it != fReadyInputTensorInfos.end()) {
            return it->second.shape;
        }
        auto it2 = fIntermediateTensorInfos.find(name);
        if (it2 != fIntermediateTensorInfos.end()) {
            return it2->second.shape;
        }
        auto it3 = fInitializedTensors.find(name);
        if (it3 != fInitializedTensors.end()) {
            return it3->second.shape();
        }
        throw std::runtime_error("Tensor not found: " + name);
    }

    const ETensorType& GetTensorType(const std::string& name) {
        auto it = fReadyInputTensorInfos.find(name);
        if (it != fReadyInputTensorInfos.end()) {
            return it->second.type;
        }
        auto it2 = fIntermediateTensorInfos.find(name);
        if (it2 != fIntermediateTensorInfos.end()) {
            return it2->second.type;
        }
        auto it3 = fInitializedTensors.find(name);
        if (it3 != fInitializedTensors.end()) {
            return it3->second.type();
        }
        throw std::runtime_error("Tensor type not found: " + name);
    }

    bool CheckIfTensorAlreadyExist(const std::string& name) {
        return (fReadyInputTensorInfos.find(name) != fReadyInputTensorInfos.end()) ||
               (fIntermediateTensorInfos.find(name) != fIntermediateTensorInfos.end()) ||
               (fInitializedTensors.find(name) != fInitializedTensors.end()) ||
               (fInputTensorInfos.find(name) != fInputTensorInfos.end());
    }

    // Add input tensor info
    void AddInputTensorInfo(const std::string& name, ETensorType type, const std::vector<size_t>& shape) {
        TensorInfo info;
        info.type = type;
        info.shape = shape;
        fReadyInputTensorInfos[name] = info;

        // Also add to input tensor names if not already there
        if (std::find(fInputTensorNames.begin(), fInputTensorNames.end(), name) == fInputTensorNames.end()) {
            fInputTensorNames.push_back(name);
        }
    }

    // Add input tensor with dynamic shape
    void AddInputTensorInfo(const std::string& name, ETensorType type, const std::vector<Dim>& shape) {
        InputTensorInfo info;
        info.type = type;
        info.shape = shape;
        fInputTensorInfos[name] = info;

        // Also add to input tensor names if not already there
        if (std::find(fInputTensorNames.begin(), fInputTensorNames.end(), name) == fInputTensorNames.end()) {
            fInputTensorNames.push_back(name);
        }
    }

    // Add intermediate tensor
    void AddIntermediateTensor(const std::string& name, ETensorType type, const std::vector<size_t>& shape) {
        TensorInfo info;
        info.type = type;
        info.shape = shape;
        fIntermediateTensorInfos[name] = info;
    }

    // Add output tensor names
    void AddOutputTensorNameList(const std::vector<std::string>& names) {
        fOutputTensorNames = names;
    }

    // Add operator to model
    void AddOperator(std::unique_ptr<ROperator> op) {
        op->Initialize(*this);
        fOperators.push_back(std::move(op));
    }

    // Initialize model
    void Initialize(int batchSize = -1) {
        fIsInitialized = true;
        if (Verbose()) {
            std::cout << "Model initialized with batch size: " <<
                (batchSize == -1 ? "default" : std::to_string(batchSize)) << std::endl;
        }
    }

    // Code generation (simplified placeholder)
    void Generate(Options options = Options::kDefault, int batchSize = -1) {
        if (!fIsInitialized) {
            Initialize(batchSize);
        }

        // Placeholder for actual generation logic
        if (Verbose()) {
            std::cout << "Generating code with options: " <<
                static_cast<int>(options) << " and batch size: " <<
                (batchSize == -1 ? "default" : std::to_string(batchSize)) << std::endl;
        }
    }
};

}}} // namespace TMVA::Experimental::SOFIE

#endif // TMVA_SOFIE_RMODEL

Writing /content/tmva_cuda_project/include/TMVA/RModel.hxx


In [None]:
%%writefile /content/tmva_cuda_project/include/TMVA/ROperator.hxx
#ifndef TMVA_SOFIE_ROPERATOR
#define TMVA_SOFIE_ROPERATOR

#include "TMVA/SOFIE_common.hxx"
#include <string>
#include <vector>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

// Forward declaration
class RModel;

// Base class for all operators
class ROperator {
public:
    virtual ~ROperator() = default;
    virtual void Initialize(RModel& model) = 0;
    virtual std::string Generate(std::string OpName) = 0;

    // Common members
    std::vector<std::string> fInputTensorNames;
    std::vector<std::string> fOutputTensorNames;
};

}}} // namespace TMVA::Experimental::SOFIE

#endif // TMVA_SOFIE_ROPERATOR

Overwriting /content/tmva_cuda_project/include/TMVA/ROperator.hxx


In [11]:
%%writefile /content/tmva_cuda_project/include/TMVA/SOFIE_common.hxx
#ifndef TMVA_SOFIE_COMMON
#define TMVA_SOFIE_COMMON

#include <string>
#include <vector>
#include <memory>
#include <stdexcept>
#include <iostream>
#include <unordered_map>
#include <functional>
#include <algorithm>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

// Basic tensor type enum (matching official implementation)
enum class ETensorType {
    UNDEFINED = 0, FLOAT = 1, UNINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5,
    INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, FLOAT16 = 10, DOUBLE = 11,
    UINT32 = 12, UINT64 = 13, COMPLEX64 = 14, COMPLEX28 = 15, BFLOAT16 = 16
};

// Dimension structure for dynamic shapes
struct Dim {
    bool isParam = false;
    size_t dim = 0;
    std::string param;

    // Constructors
    Dim() {}
    Dim(const std::string& p, size_t d = 0) : isParam(true), dim(d), param(p) {}
    Dim(size_t d) : dim(d) {}

    std::string GetVal() const {
        return (isParam) ? param : std::to_string(dim);
    }
};

struct InputTensorInfo {
    ETensorType type;
    std::vector<Dim> shape;
};

struct TensorInfo {
    ETensorType type;
    std::vector<size_t> shape;
};

struct DynamicTensorInfo {
    ETensorType type;
    std::vector<Dim> shape;
};

// Helper functions - add inline to prevent multiple definition errors
inline size_t ConvertShapeToLength(const std::vector<size_t>& shape) {
    size_t length = 1;
    for (auto& dim : shape) {
        length *= dim;
    }
    return length;
}

inline std::string ConvertShapeToString(std::vector<size_t> shape) {
    std::string result = "{";
    for (size_t i = 0; i < shape.size(); i++) {
        result += std::to_string(shape[i]);
        if (i < shape.size() - 1) result += ", ";
    }
    result += "}";
    return result;
}

// Get string representation of type
template<typename T>
inline std::string GetTensorTypeName() {
    if (std::is_same<T, float>::value) return "float";
    if (std::is_same<T, double>::value) return "double";
    if (std::is_same<T, int64_t>::value) return "int64_t";
    if (std::is_same<T, int32_t>::value) return "int32_t";
    if (std::is_same<T, bool>::value) return "bool";
    return "unknown";
}

// Get ETensorType from C++ type
template<typename T>
ETensorType GetTemplatedType(T) {
    if (std::is_same<T, float>::value) return ETensorType::FLOAT;
    if (std::is_same<T, double>::value) return ETensorType::DOUBLE;
    if (std::is_same<T, int64_t>::value) return ETensorType::INT64;
    if (std::is_same<T, int32_t>::value) return ETensorType::INT32;
    if (std::is_same<T, bool>::value) return ETensorType::BOOL;
    throw std::runtime_error("Unsupported type in GetTemplatedType");
}

// Simple initialized tensor class - simplified version of the official one
class InitializedTensor {
public:
    InitializedTensor() = default;
    InitializedTensor(ETensorType type, const std::vector<size_t>& shape,
                     std::shared_ptr<void> data, bool constant = false)
        : fConstant(constant), fType(type), fShape(shape), fData(data) {}

    ETensorType const &type() const { return fType; }
    std::vector<std::size_t> const &shape() const { return fShape; }
    std::shared_ptr<void> const &sharedptr() const { return fData; }

    // Additional flags to match official behavior
    bool IsConstantTensor() const { return fConstant; }
    bool IsWeightTensor() const { return !fConstant && !fIsNotWritable; }
    void SetNotWritable() { fIsNotWritable = true; }

    template <class T = void>
    T const *data() const {
        return static_cast<T const *>(fData.get());
    }

private:
    bool fConstant = false;      // Flag for constant tensors
    bool fIsNotWritable = false; // Flag for not writable tensors
    ETensorType fType;
    std::vector<size_t> fShape;
    std::shared_ptr<void> fData;
};

// Utility namespace
namespace UTILITY {
    inline std::string Clean_name(const std::string& name) {
        return name; // Simplified for testing
    }

    // Check if two shapes are equal
    inline bool AreSameShape(const std::vector<size_t>& a, const std::vector<size_t>& b) {
        if (a.size() != b.size()) return false;
        for (size_t i = 0; i < a.size(); i++) {
            if (a[i] != b[i]) return false;
        }
        return true;
    }
}

}}} // namespace TMVA::Experimental::SOFIE

#endif // TMVA_SOFIE_COMMON

Writing /content/tmva_cuda_project/include/TMVA/SOFIE_common.hxx


In [None]:
%%writefile /content/tmva_cuda_project/include/TMVA/RModel.hxx
#ifndef TMVA_SOFIE_RMODEL
#define TMVA_SOFIE_RMODEL

#include "TMVA/SOFIE_common.hxx"
#include <unordered_map>
#include <string>
#include <vector>
#include <memory>
#include <map>
#include <algorithm>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

// Forward declaration
class ROperator;

// Mock RModel class for our implementation
class RModel {
private:
    std::string fName;
    std::string fParsedDateTime;
    bool fIsInitialized = false;
    int fVerbose = 1;

    // Tensor storage
    std::unordered_map<std::string, ETensorType> fTensorTypes;
    std::unordered_map<std::string, std::vector<size_t>> fTensorShapes;
    std::vector<std::string> fOutputTensorNames;
    std::vector<std::string> fInputTensorNames;

public:
    RModel() = default;
    RModel(std::string name, std::string parsedtime) : fName(name), fParsedDateTime(parsedtime) {}

    int Verbose() const { return fVerbose; }

    const std::vector<size_t>& GetTensorShape(const std::string& name) {
        auto it = fTensorShapes.find(name);
        if (it != fTensorShapes.end()) {
            return it->second;
        }
        throw std::runtime_error("Tensor not found: " + name);
    }

    const ETensorType& GetTensorType(const std::string& name) {
        auto it = fTensorTypes.find(name);
        if (it != fTensorTypes.end()) {
            return it->second;
        }
        throw std::runtime_error("Tensor type not found: " + name);
    }

    bool CheckIfTensorAlreadyExist(const std::string& name) const {
        return fTensorShapes.find(name) != fTensorShapes.end();
    }

    // Add input tensor info
    void AddInputTensorInfo(const std::string& name, ETensorType type, const std::vector<size_t>& shape) {
        fTensorTypes[name] = type;
        fTensorShapes[name] = shape;

        // Also add to input tensor names if not already there
        if (std::find(fInputTensorNames.begin(), fInputTensorNames.end(), name) == fInputTensorNames.end()) {
            fInputTensorNames.push_back(name);
        }
    }

    // Add intermediate tensor
    void AddIntermediateTensor(const std::string& name, ETensorType type, const std::vector<size_t>& shape) {
        fTensorTypes[name] = type;
        fTensorShapes[name] = shape;
    }

    // Add output tensor names
    void AddOutputTensorNameList(const std::vector<std::string>& names) {
        fOutputTensorNames = names;
    }

    // Initialize model (simplified for mock)
    void Initialize(int batchSize = -1) {
        fIsInitialized = true;

        if (Verbose()) {
            std::cout << "Model initialized with batch size: " <<
                (batchSize == -1 ? "default" : std::to_string(batchSize)) << std::endl;
        }
    }
};

}}} // namespace TMVA::Experimental::SOFIE

#endif // TMVA_SOFIE_RMODEL

Overwriting /content/tmva_cuda_project/include/TMVA/RModel.hxx


In [12]:
!cd /content/tmva_cuda_project && cmake -B build && cmake --build build

-- The CUDA compiler identification is NVIDIA 12.5.82 with host compiler GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: /usr/local/cuda/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
  Policy CMP0146 is not set: The FindCUDA module is removed.  Run "cmake
  --help-policy CMP0146" for policy details.  Use the cmake_policy command to

[0m
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- Found CUDA: /usr/local/cuda (found version "12.5")
-- Configuring done (4.2s)
-- Generating done (0.0s)
-- Build files have been written to: /content/t

In [13]:
!cd /content/tmva_cuda_project/build && ./test_elu_cuda

Testing TMVA SOFIE CUDA ELU Operator
Model initialized with batch size: default

---- Testing with alpha = 0.1 ----
ELU CUDA: input -> output (alpha=0.1)

Generated CUDA code (excerpt):
   float op_TestElu_alpha = 0.100000001;

//------ ELU CUDA
   // CUDA kernel for ELU operation
   __global__ void op_TestElu_elu_kernel(const float* input, float* output, size_t size, float alpha) {
      int idx = blockIdx.x * blockDim.x + threadId...

Input tensor:
-2.0000 -1.5000 -1.0000 
-0.5000 0.0000 0.5000 
1.0000 1.5000 2.0000 
-0.3000 -0.2000 -0.1000 
0.1000 0.2000 0.3000 

Expected output tensor (CPU ELU with alpha=0.1000):
-0.0865 -0.0777 -0.0632 
-0.0393 0.0000 0.5000 
1.0000 1.5000 2.0000 
-0.0259 -0.0181 -0.0095 
0.1000 0.2000 0.3000 

ELU behavior examples (alpha=0.1000):
For x = -1.0: ELU(x) = -0.0632
For x = 0.0: ELU(x) = 0.0000
For x = 1.0: ELU(x) = 1.0000

---- Testing with alpha = 1.0000 ----
ELU CUDA: input -> output (alpha=1.0000)

Generated CUDA code (excerpt):
   float op_TestEl