Eamon2009 · Eamon2009 · May 29, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
@@ -1,48 +1,40 @@
 name: Publish Docker image
 on:
-  push:
-    branches:
-      - master
-    tags:
-      - "v*.*.*"
-    paths-ignore:
-      - 'cuda/**'
-      - 'docs/**'
-      - '**.md'
-  pull_request:
-    branches:
-      - master
-    paths-ignore:
-      - 'cuda/**'
-      - 'docs/**'
-      - '**.md'
-
+  # Manual trigger only: the workflow runs when it is dispatched by hand.
+  workflow_dispatch:
+# One active run per workflow and ref; a newer run cancels the one in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
 env:
   REGISTRY: ghcr.io
-
 jobs:
   build-and-push:
-    name: Build & push to ghcr.io
+    name: Build & push (${{ matrix.variant }})
     runs-on: ubuntu-latest
-
     permissions:
       contents: read
       packages: write
-
+    strategy:
+      fail-fast: false  # a failing variant does not cancel the other one
+      matrix:
+        include:
+          - variant: cpu  # also used as the build-cache scope
+            dockerfile: Dockerfile
+            tag_suffix: ""  # CPU image tags carry no suffix
+          - variant: cuda
+            dockerfile: Dockerfile.cuda
+            tag_suffix: "-cuda"  # appended to the tags of the CUDA image
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
       - name: Set lowercase image name
         id: image
         run: |
           echo "name=$(echo '${{ github.repository }}' | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT
-
       - name: Set up QEMU
         uses: docker/setup-qemu-action@v3
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
       - name: Log in to ghcr.io
-        if: github.event_name != 'pull_request'
         uses: docker/login-action@v3
         with:
           registry: ${{ env.REGISTRY }}
@@ -54,29 +46,28 @@ jobs:
         with:
           images: ${{ env.REGISTRY }}/${{ steps.image.outputs.name }}
           tags: |
-            type=raw,value=latest,enable={{is_default_branch}}
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=ref,event=pr
-      - name: Build and push Docker image (CPU)
+            type=raw,value=latest${{ matrix.tag_suffix }},enable={{is_default_branch}}
+            type=semver,pattern={{version}},suffix=${{ matrix.tag_suffix }}
+            type=semver,pattern={{major}}.{{minor}},suffix=${{ matrix.tag_suffix }}
+            type=ref,event=pr,suffix=${{ matrix.tag_suffix }}
+      # Runs for the cuda variant only: removes three directories from the
+      # runner, then reports the remaining disk space.
+      - name: Free disk space
+        if: ${{ matrix.variant == 'cuda' }}
+        run: |
+          for dir in /usr/share/dotnet /opt/ghc /usr/local/share/boost; do
+            sudo rm -rf "$dir"
+          done
+          df -h
+      - name: Build and push Docker image  # NOTE(review): the BASE_IMAGE build-arg is no longer passed — confirm both Dockerfiles set a default
         uses: docker/build-push-action@v6
         with:
           context: .
-          file: ./Dockerfile
-          push: ${{ github.event_name != 'pull_request' }}
+          file: ./${{ matrix.dockerfile }}  # per-variant Dockerfile from the matrix
+          push: true  # every run pushes; there is no build-only path
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
-          build-args: |
-            BASE_IMAGE=ubuntu:24.04
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
+          cache-from: type=gha,scope=${{ matrix.variant }}  # separate cache scope per variant
+          cache-to: type=gha,mode=max,scope=${{ matrix.variant }}
       - name: Image published
-        if: github.event_name != 'pull_request'
         run: |
-          echo "Image published to GitHub Packages"
-          echo ""
-          echo "Pull with:"
-          echo "  docker pull ${{ env.REGISTRY }}/${{ steps.image.outputs.name }}:latest"
-          echo ""
-          echo "Or via docker-compose:"
-          echo "  image: ${{ env.REGISTRY }}/${{ steps.image.outputs.name }}:latest"
+          # Print one pull command per tag that the metadata step generated,
+          # instead of assuming that a "latest" tag exists for this ref
+          # ("latest" is only enabled on the default branch).
+          echo "[${{ matrix.variant }}] published:"
+          echo "${{ steps.meta.outputs.tags }}" | sed 's/^/  docker pull /'
diff --git a/cuda/KERNAL/adamw.cu → CUDA/KERNAL/adamw.cu b/cuda/KERNAL/adamw.cu → CUDA/KERNAL/adamw.cu
diff --git a/cuda/includes/adamw.cuh → CUDA/includes/adamw.cuh b/cuda/includes/adamw.cuh → CUDA/includes/adamw.cuh
diff --git a/CUDA/llmcpp/cuda_common.h b/CUDA/llmcpp/cuda_common.h
@@ -0,0 +1,209 @@
+/*
+Common utilities for CUDA code.
+*/
+#ifndef CUDA_COMMON_H
+#define CUDA_COMMON_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <string>
+#include <type_traits>      // std::bool_constant
+#include <cuda_runtime.h>
+#include <nvtx3/nvToolsExt.h>
+#include <nvtx3/nvToolsExtCudaRt.h>
+#include <cuda_profiler_api.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#include "utils.h"
+
+// ----------------------------------------------------------------------------
+// Global defines and settings
+
+// Properties of the CUDA device used by this process.
+// Only declared here (extern) because the individual kernels wish to use it;
+// the object itself is created and instantiated in the main program file.
+extern cudaDeviceProp deviceProp;
+
+// The built-in warpSize is not a compile-time constant.
+// Defining the warp width as a macro possibly allows the compiler to optimize better.
+#define WARP_SIZE 32U
+
+// Try to make sure that 2 blocks fit on A100/H100 to maximise latency tolerance.
+// This needs to be a #define rather than a runtime query so it can be used in __launch_bounds__.
+#if __CUDA_ARCH__ == 800 || __CUDA_ARCH__ >= 900  // sm_80, or sm_90 and newer
+#define MAX_1024_THREADS_BLOCKS 2
+#else  // every other architecture, and passes where __CUDA_ARCH__ is undefined (it evaluates to 0)
+#define MAX_1024_THREADS_BLOCKS 1
+#endif
+
+// Convenience macro for calculating grid/block dimensions for kernels: M / N rounded up.
+#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))  // N is expanded twice: avoid arguments with side effects
+
+// Short-cuts for compile-time boolean values that can be used as function arguments.
+// True wraps the constant true and False wraps the constant false, so the two tags
+// have distinct types and can select different overloads or template branches.
+constexpr std::bool_constant<true> True;
+constexpr std::bool_constant<false> False;
+
+// ----------------------------------------------------------------------------
+// Error checking
+
+// CUDA error checking. Underscore added so this function can be called directly not just via macro
+inline void cudaCheck_(cudaError_t error, const char *file, int line) {
+  if (error != cudaSuccess) {
+    printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line, cudaGetErrorString(error));
+    exit(EXIT_FAILURE);
+  }
+};
+#define cudaCheck(err) (cudaCheck_(err, __FILE__, __LINE__))
+
+// Like cudaFree, but checks for errors _and_ resets the pointer.
+//   ptr        - address of the pointer to release; *ptr is set to nullptr once cudaFree succeeded
+//   file, line - call site shown in the error message (supplied by the cudaFreeCheck macro below)
+// A failing cudaFree is reported through cudaCheck_, which prints the error and exits the process.
+template<class T>
+inline void cudaFreeCheck(T** ptr, const char *file, int line) {
+    cudaCheck_(cudaFree(*ptr), file, line);
+    *ptr = nullptr;
+}
+#define cudaFreeCheck(ptr) (cudaFreeCheck(ptr, __FILE__, __LINE__))
+
+// ----------------------------------------------------------------------------
+// CUDA Precision settings and defines
+
+enum PrecisionMode {  // names the type that floatX is aliased to below
+    PRECISION_FP32,  // floatX is float
+    PRECISION_FP16,  // floatX is half
+    PRECISION_BF16   // floatX is __nv_bfloat16
+};
+
+// Specific configurations based on the enabled precision, selected by the ENABLE_FP32 / ENABLE_FP16 macros
+#if defined(ENABLE_FP32)
+typedef float floatX;
+#define PRECISION_MODE PRECISION_FP32
+// use fp16 (note: this may require gradient scaler, currently not implemented!)
+#elif defined(ENABLE_FP16)
+typedef half floatX;
+#define PRECISION_MODE PRECISION_FP16
+#else // Default to bfloat16 when neither macro above is defined; ENABLE_BF16 is not consulted here
+typedef __nv_bfloat16 floatX;
+#define PRECISION_MODE PRECISION_BF16
+#endif
+
+// ----------------------------------------------------------------------------
+// Load and store with streaming cache hints
+// Older nvcc does not provide __ldcs and __stcs for bfloat16, despite these actually just being
+// unsigned shorts. We need to be careful here to only define our own versions if none already exist,
+// otherwise the compiler will complain: you easily get "no viable overload" (for sm52) or "function already exists" (sm_80).
+// NOTE(review): the guard tests ENABLE_BF16, but bf16 is also the default when no ENABLE_* macro is set — confirm the build defines it.
+
+#if defined(ENABLE_BF16) && (__CUDACC_VER_MAJOR__ < 12) && !((__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__))  // CUDA < 12 and __CUDA_ARCH__ defined and below 800
+__device__ floatX __ldcs(const floatX* address) {  // bf16 overload built on the unsigned short one
+    unsigned short bf = __ldcs(reinterpret_cast<const unsigned short*>(address));  // load the raw 16 bits
+    return __nv_bfloat16_raw{bf};  // rewrap the bits as a bfloat16 value
+}
+
+__device__ void __stcs(floatX* address, floatX value) {  // bf16 overload built on the unsigned short one
+    __stcs(reinterpret_cast<unsigned short*>(address), ((__nv_bfloat16_raw)value).x);  // store the raw 16 bits
+}
+#endif
+
+// ----------------------------------------------------------------------------
+// Profiler utils
+
+// Scoped NVTX range: constructing the object pushes a range and
+// destroying it pops a range again.
+class NvtxRange {
+ public:
+    // Pushes a range labelled with the given C string.
+    NvtxRange(const char* s) {
+        nvtxRangePush(s);
+    }
+
+    // Pushes a range labelled "<base_str> <number>".
+    NvtxRange(const std::string& base_str, int number) {
+        nvtxRangePush((base_str + " " + std::to_string(number)).c_str());
+    }
+
+    // Pops the range when the object goes out of scope.
+    ~NvtxRange() {
+        nvtxRangePop();
+    }
+};
+// Declares a local NvtxRange named nvtx_range, labelled with the enclosing function's name.
+#define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__)
+
+// ----------------------------------------------------------------------------
+// Utilities to Read & Write between CUDA memory <-> files
+
+/*
+ * Copy num_bytes from device pointer src into file dest, using double buffering running on the given stream.
+ *
+ *   dest        - destination file; written through fwriteCheck
+ *   src         - device pointer to read from
+ *   num_bytes   - total number of bytes to copy
+ *   buffer_size - size in bytes of each of the two pinned staging buffers (twice this amount is allocated)
+ *   stream      - CUDA stream on which the device-to-host copies are issued
+ *
+ * While one chunk is written to disk, the next chunk is already being copied from the device.
+ * CUDA failures are handled by cudaCheck.
+ * NOTE(review): with buffer_size == 0 and num_bytes > 0 the loop never makes progress;
+ * callers presumably always pass a positive buffer_size — confirm.
+ */
+inline void device_to_file(FILE* dest, void* src, size_t num_bytes, size_t buffer_size, cudaStream_t stream) {
+    // allocate one pinned host buffer for faster, async transfer
+    char* buffer_space;
+    cudaCheck(cudaMallocHost(&buffer_space, 2*buffer_size));
+    // split the allocation into two halves of buffer_size bytes each
+    void* read_buffer = buffer_space;
+    void* write_buffer = buffer_space + buffer_size;
+
+    // prime the read buffer; nothing can overlap with the first copy, so we have to wait for it
+    char* gpu_read_ptr = (char*)src;
+    size_t copy_amount = std::min(buffer_size, num_bytes);
+    cudaCheck(cudaMemcpyAsync(read_buffer, gpu_read_ptr, copy_amount, cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaStreamSynchronize(stream));
+    size_t rest_bytes = num_bytes - copy_amount;  // bytes not yet requested from the device
+    size_t write_buffer_size = copy_amount;  // bytes waiting to be written to disk
+    gpu_read_ptr += copy_amount;
+
+    std::swap(read_buffer, write_buffer);  // the chunk just read becomes the one to write
+    // now the main loop; as long as there are bytes left
+    while(rest_bytes > 0) {
+        // initiate next read from the device
+        copy_amount = std::min(buffer_size, rest_bytes);
+        cudaCheck(cudaMemcpyAsync(read_buffer, gpu_read_ptr, copy_amount, cudaMemcpyDeviceToHost, stream));
+        // while this is going on, transfer the write buffer to disk
+        fwriteCheck(write_buffer, 1, write_buffer_size, dest);
+        cudaCheck(cudaStreamSynchronize(stream));     // wait for both buffers to be ready.
+
+        std::swap(read_buffer, write_buffer);
+        rest_bytes -= copy_amount;
+        write_buffer_size = copy_amount;
+        gpu_read_ptr += copy_amount;
+    }
+
+    // make sure to write the last remaining write buffer
+    fwriteCheck(write_buffer, 1, write_buffer_size, dest);
+    cudaCheck(cudaFreeHost(buffer_space));
+}
+
+/*
+ * Copy num_bytes from file src into device pointer dest, using double buffering running on the given stream.
+ *
+ *   dest        - device pointer to write to
+ *   src         - source file; read through freadCheck
+ *   num_bytes   - total number of bytes to copy
+ *   buffer_size - size in bytes of each of the two pinned staging buffers (twice this amount is allocated)
+ *   stream      - CUDA stream on which the host-to-device copies are issued
+ *
+ * While one chunk is copied to the device, the next chunk is already being read from disk.
+ * The stream is synchronized after the final copy, before the staging buffer is freed.
+ * CUDA failures are handled by cudaCheck.
+ * NOTE(review): with buffer_size == 0 and num_bytes > 0 the loop never makes progress;
+ * callers presumably always pass a positive buffer_size — confirm.
+ */
+inline void file_to_device(void* dest, FILE* src, size_t num_bytes, size_t buffer_size, cudaStream_t stream) {
+    // allocate one pinned, write-combined host buffer for faster, async transfer
+    // from the docs (https://developer.download.nvidia.com/compute/DevZone/docs/html/C/doc/html/group__CUDART__HIGHLEVEL_ge439496de696b166ba457dab5dd4f356.html)
+    // WC memory is a good option for buffers that will be written by the CPU and read by the device via mapped pinned memory or host->device transfers.
+    char* buffer_space;
+    cudaCheck(cudaMallocHost(&buffer_space, 2*buffer_size, cudaHostAllocWriteCombined));
+    // split the allocation into two halves of buffer_size bytes each
+    void* read_buffer = buffer_space;
+    void* write_buffer = buffer_space + buffer_size;
+
+    // prime the read buffer with the first chunk from disk
+    char* gpu_write_ptr = (char*)dest;
+    size_t copy_amount = std::min(buffer_size, num_bytes);
+    freadCheck(read_buffer, 1, copy_amount, src);
+
+    size_t rest_bytes = num_bytes - copy_amount;  // bytes not yet read from the file
+    size_t write_buffer_size = copy_amount;  // bytes waiting to be copied to the device
+    std::swap(read_buffer, write_buffer);  // the chunk just read becomes the one to upload
+
+    // now the main loop; as long as there are bytes left
+    while(rest_bytes > 0) {
+        // start uploading the chunk that was read previously
+        copy_amount = std::min(buffer_size, rest_bytes);
+        cudaCheck(cudaMemcpyAsync(gpu_write_ptr, write_buffer, write_buffer_size, cudaMemcpyHostToDevice, stream));
+        gpu_write_ptr += write_buffer_size;
+        // while this is going on, read the next chunk from disk
+        freadCheck(read_buffer, 1, copy_amount, src);
+        cudaCheck(cudaStreamSynchronize(stream));     // wait for both buffers to be ready.
+
+        std::swap(read_buffer, write_buffer);
+        rest_bytes -= copy_amount;
+        write_buffer_size = copy_amount;
+    }
+
+    // copy the last remaining write buffer to gpu
+    cudaCheck(cudaMemcpyAsync(gpu_write_ptr, write_buffer, write_buffer_size, cudaMemcpyHostToDevice, stream));
+    cudaCheck(cudaStreamSynchronize(stream));
+    cudaCheck(cudaFreeHost(buffer_space));
+}
+
+#endif // CUDA_COMMON_H