Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
71 commits
Select commit Hold shift + click to select a range
13e096f
feat(cuda): add CheckpointMetadata struct and checkpoint stub functions
Eamon2009 May 26, 2026
d2ca170
feat(cuda): add TokenBatchView and DataLoader stubs
Eamon2009 May 26, 2026
ee7668d
feat(cuda): add GELU forward and backward activation kernels declaration
Eamon2009 May 26, 2026
9c4485a
feat(cuda): add gradient clipping and global norm kernel declarations
Eamon2009 May 26, 2026
6dc4f34
feat(cuda): add LayerNorm forward and backward kernel declarations
Eamon2009 May 26, 2026
9af9eec
feat(cuda): add internal logging utility Introduce LogLevel enum sup…
Eamon2009 May 26, 2026
8904031
feat(cuda): add cuBLAS wrapper and GEMM forward/backward interfaces
Eamon2009 May 26, 2026
9a8f4da
feat(cuda): add NCCL communicator wrapper and collective reduction stubs
Eamon2009 May 26, 2026
174885c
feat(cuda): add cosine learning rate decay schedule helper
Eamon2009 May 26, 2026
2ff152f
feat(cuda): add Zero-DP sharding range and tensor view utilities
Eamon2009 May 26, 2026
b2abad5
refactor: remove dead code and clean up unused logic
Eamon2009 May 26, 2026
49aa315
refactor: remove dead code and clean up unused logic
Eamon2009 May 26, 2026
c5cbcce
refactor: remove dead code and clean up unused logic
Eamon2009 May 26, 2026
de3f8da
refactor: remove dead code and clean up unused logic
Eamon2009 May 26, 2026
a3f680e
feat(cuda): implement backward pass for attention mechanism
Eamon2009 May 26, 2026
8ccaee6
feat(cuda): implement forward pass for LayerNorm kernel
Eamon2009 May 26, 2026
2cc9a9e
feat(cuda): implement LayerNorm backward pass kernels
Eamon2009 May 26, 2026
7cc6c72
feat(cuda): implement backward pass for GELU activation kernels
Eamon2009 May 26, 2026
047d27e
feat(cuda): implement forward pass for GELU activation kernels
Eamon2009 May 26, 2026
1ba8ec0
test(cuda): add comprehensive validation script for all header files
Eamon2009 May 26, 2026
f8f3316
test(cuda): add comprehensive validation script for all header files
Eamon2009 May 26, 2026
7ad1425
test(cuda): add comprehensive validation script for all header files
Eamon2009 May 26, 2026
2fb6905
feat(cuda): add QKV permutation and unpermutation kernels
Eamon2009 May 26, 2026
1fa2f23
feat(cuda): add forward pass declaration for causal masking
Eamon2009 May 26, 2026
e88b5e5
feat(cuda): add forward pass declarations for softmax and causal softmax
Eamon2009 May 26, 2026
38bd551
feat(cuda): implement causal_mask_forward kernel and host wrapper
Eamon2009 May 26, 2026
b350deb
feat(cuda): implement matmul backward passes for input and weight gra…
Eamon2009 May 26, 2026
fc28976
feat(main): integrate and orchestrate CUDA kernels in main entrypoint
Eamon2009 May 26, 2026
e9f9820
style(assets): update project icon to new SVG logo
Eamon2009 May 26, 2026
02154d4
Delete quadtrix_training_report.png
Eamon2009 May 27, 2026
4996e02
Delete Quadtrix_Educational_Paper.pdf
Eamon2009 May 27, 2026
a1e9919
docs: document LLM architecture and system design
Eamon2009 May 27, 2026
c377b15
docs: document LLM architecture and system design
Eamon2009 May 27, 2026
478d4b4
ci: add GitHub Actions workflow to build and push Docker images
Eamon2009 May 27, 2026
64197d2
ci: add GitHub Actions workflow to build and push Docker images
Eamon2009 May 27, 2026
6fed8cc
ci: add GitHub Actions workflow to build and push Docker images
Eamon2009 May 27, 2026
e1ff117
ci: add GitHub Actions workflow to build and push Docker images
Eamon2009 May 27, 2026
b79a9d9
ci: add GitHub Actions workflow to build and push Docker images
Eamon2009 May 27, 2026
1969df2
ci: add GitHub Actions workflow to build and push Docker images
Eamon2009 May 27, 2026
232401c
ci: add GitHub Actions workflow to build and push Docker images
Eamon2009 May 27, 2026
50c46bd
ci: add GitHub Actions workflow to build and push Docker images
Eamon2009 May 27, 2026
e090012
ci: add GitHub Actions workflow to build and push Docker images
Eamon2009 May 27, 2026
1f74aab
Refactor Dockerfile to use Ubuntu base image
Eamon2009 May 27, 2026
2f16cfa
Update Dockerfile to use NVIDIA CUDA base images
Eamon2009 May 27, 2026
f8ea13e
Modify Docker publish workflow to always push images
Eamon2009 May 27, 2026
df53f7b
feat(dataloader): add distributed DataLoader and initial shard loading
Eamon2009 May 27, 2026
377098c
feat(multi-gpu): add local GPU rank detection and config initialization
Eamon2009 May 27, 2026
33ac088
feat(cuda): implement optimized forward LayerNorm kernels
Eamon2009 May 27, 2026
cbb7d73
feat(cuda): implement fallback attention permutation and softmax kernels
Eamon2009 May 27, 2026
b76c5dc
feat(cuda): add Packed128 utility for vectorized 128-bit memory opera…
Eamon2009 May 27, 2026
eaa74dd
feat: add macro-wrapped I/O and socket error checking utilities
Eamon2009 May 27, 2026
2fb0e1b
feat: add cuBLAS setup and macro utilities with mixed-precision support
Eamon2009 May 28, 2026
484fa47
feat: implement AdamW optimizer CUDA kernel with mixed-precision and …
Eamon2009 May 28, 2026
dda5c50
feat: add comprehensive CUDA common utilities and precision configura…
Eamon2009 May 28, 2026
0f59dbd
Delete CUDA/llmcpp directory
Eamon2009 May 28, 2026
7251988
deleted
Eamon2009 May 28, 2026
9bd42ef
feat(cuda): add common cuda utilities header with error checking, pre…
Eamon2009 May 28, 2026
dde9024
Delete cuda directory
Eamon2009 May 28, 2026
d1f3d1a
feat(cuda): implement AdamW optimizer kernel and host interface
Eamon2009 May 28, 2026
a4f4d2b
feat(cuda): declare AdamW configuration and host interface
Eamon2009 May 28, 2026
c96ede9
style(theme): overhaul global design tokens and layout variables
Eamon2009 May 29, 2026
bb11b15
refactor(components): migrate Button from Tailwind classes to CSS-in-…
Eamon2009 May 29, 2026
4073236
refactor(components): migrate SessionItem to inline styles and add ho…
Eamon2009 May 29, 2026
5987e7f
refactor(components): migrate StatsPanel to inline styles and adjust …
Eamon2009 May 29, 2026
dcae80f
refactor(components): migrate SettingsPanel to inline styles and upda…
Eamon2009 May 29, 2026
873ba39
refactor(components): migrate ModelBadge to inline styles and add inl…
Eamon2009 May 29, 2026
65bcb62
refactor(components): migrate Topbar to inline styles and add mobile …
Eamon2009 May 29, 2026
afe9ead
refactor(components): migrate Sidebar to inline styles and adjust str…
Eamon2009 May 29, 2026
ee5134b
refactor(layout): migrate AppLayout shell to inline styles
Eamon2009 May 29, 2026
b22250f
refactor(components): migrate InputBar to inline styles and adjust be…
Eamon2009 May 29, 2026
f7214c6
Update README.md removing npm package description
Eamon2009 May 29, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 33 additions & 42 deletions .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
@@ -1,48 +1,40 @@
name: Publish Docker image
on:
push:
branches:
- master
tags:
- "v*.*.*"
paths-ignore:
- 'cuda/**'
- 'docs/**'
- '**.md'
pull_request:
branches:
- master
paths-ignore:
- 'cuda/**'
- 'docs/**'
- '**.md'

workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
REGISTRY: ghcr.io

jobs:
build-and-push:
name: Build & push to ghcr.io
name: Build & push (${{ matrix.variant }})
runs-on: ubuntu-latest

permissions:
contents: read
packages: write

strategy:
fail-fast: false
matrix:
include:
- variant: cpu
dockerfile: Dockerfile
tag_suffix: ""
- variant: cuda
dockerfile: Dockerfile.cuda
tag_suffix: "-cuda"
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set lowercase image name
id: image
run: |
echo "name=$(echo '${{ github.repository }}' | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT

- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to ghcr.io
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
Expand All @@ -54,29 +46,28 @@ jobs:
with:
images: ${{ env.REGISTRY }}/${{ steps.image.outputs.name }}
tags: |
type=raw,value=latest,enable={{is_default_branch}}
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=ref,event=pr
- name: Build and push Docker image (CPU)
type=raw,value=latest${{ matrix.tag_suffix }},enable={{is_default_branch}}
type=semver,pattern={{version}},suffix=${{ matrix.tag_suffix }}
type=semver,pattern={{major}}.{{minor}},suffix=${{ matrix.tag_suffix }}
type=ref,event=pr,suffix=${{ matrix.tag_suffix }}
- name: Free disk space
if: matrix.variant == 'cuda'
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
df -h
- name: Build and push Docker image
uses: docker/build-push-action@v6
with:
context: .
file: ./Dockerfile
push: ${{ github.event_name != 'pull_request' }}
file: ./${{ matrix.dockerfile }}
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
build-args: |
BASE_IMAGE=ubuntu:24.04
cache-from: type=gha
cache-to: type=gha,mode=max
cache-from: type=gha,scope=${{ matrix.variant }}
cache-to: type=gha,mode=max,scope=${{ matrix.variant }}
- name: Image published
if: github.event_name != 'pull_request'
run: |
echo "Image published to GitHub Packages"
echo ""
echo "Pull with:"
echo " docker pull ${{ env.REGISTRY }}/${{ steps.image.outputs.name }}:latest"
echo ""
echo "Or via docker-compose:"
echo " image: ${{ env.REGISTRY }}/${{ steps.image.outputs.name }}:latest"
echo "[${{ matrix.variant }}] published:"
echo " docker pull ${{ env.REGISTRY }}/${{ steps.image.outputs.name }}:latest${{ matrix.tag_suffix }}"
File renamed without changes.
File renamed without changes.
209 changes: 209 additions & 0 deletions CUDA/llmcpp/cuda_common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
/*
Common utilities for CUDA code.
*/
#ifndef CUDA_COMMON_H
#define CUDA_COMMON_H

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <algorithm>    // std::min
#include <string>
#include <type_traits>  // std::bool_constant
#include <utility>      // std::swap
#include <cuda_runtime.h>
#include <nvtx3/nvToolsExt.h>
#include <nvtx3/nvToolsExtCudaRt.h>
#include <cuda_profiler_api.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>

#include "utils.h"

// ----------------------------------------------------------------------------
// Global defines and settings

// Device properties of the CUDA device used in this process.
// Declared extern here because the individual kernels wish to use it;
// the object itself is created and instantiated in the main program file.
extern cudaDeviceProp deviceProp;

// WarpSize is not a compile time constant.
// Defining it here as an unsigned literal possibly allows the compiler to optimize better.
#define WARP_SIZE 32U

// Try to make sure that 2 blocks fit on A100/H100 to maximise latency tolerance.
// This needs to be a #define rather than a queried value so it can be used in __launch_bounds__.
// When __CUDA_ARCH__ is undefined the preprocessor evaluates it as 0 in the
// #if below, so the #else branch (1) is selected in that case.
#if __CUDA_ARCH__ == 800 || __CUDA_ARCH__ >= 900
#define MAX_1024_THREADS_BLOCKS 2
#else
#define MAX_1024_THREADS_BLOCKS 1
#endif

// Convenience macro for calculating grid/block dimensions for kernels:
// integer division of M by N, rounded up. N is expanded twice, so pass an
// expression without side effects.
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

// Short-cuts for compile-time boolean values that can be used as function arguments.
// `True` has type std::bool_constant<true> and `False` has type std::bool_constant<false>,
// so passing one of them selects a distinct overload / template instantiation at compile time.
constexpr std::bool_constant<true> True{};
constexpr std::bool_constant<false> False{};

// ----------------------------------------------------------------------------
// Error checking

// Validates the status code of a CUDA runtime call.
// On cudaSuccess this is a no-op; on any other status it prints the call site
// and the runtime's error string, then terminates the process with EXIT_FAILURE.
// The trailing underscore keeps the function callable directly (with an explicit
// file/line) in addition to the cudaCheck() macro below.
inline void cudaCheck_(cudaError_t error, const char *file, int line) {
    if (error == cudaSuccess) {
        return;
    }
    const char *message = cudaGetErrorString(error);
    printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line, message);
    exit(EXIT_FAILURE);
}
#define cudaCheck(err) (cudaCheck_(err, __FILE__, __LINE__))

// Checked variant of cudaFree that also clears the caller's pointer.
//   ptr  - address of the pointer to release; *ptr is passed to cudaFree and
//          set to nullptr once the free succeeded.
//   file - source file reported in the error message.
//   line - source line reported in the error message.
// If cudaFree reports an error, the message is printed and the process exits
// with EXIT_FAILURE. Normally invoked through the cudaFreeCheck(ptr) macro,
// which fills in file and line from the call site.
template<class T>
inline void cudaFreeCheck(T** ptr, const char *file, int line) {
    const cudaError_t status = cudaFree(*ptr);
    if (status == cudaSuccess) {
        *ptr = nullptr;
        return;
    }
    printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}
#define cudaFreeCheck(ptr) (cudaFreeCheck(ptr, __FILE__, __LINE__))

// ----------------------------------------------------------------------------
// CUDA Precision settings and defines

// Identifiers for the floating-point formats floatX can be built as;
// PRECISION_MODE below expands to exactly one of these enumerators.
enum PrecisionMode {
PRECISION_FP32,
PRECISION_FP16,
PRECISION_BF16
};

// Specific configurations based on the enabled precision:
//   ENABLE_FP32 defined          -> floatX is float,          PRECISION_MODE is PRECISION_FP32
//   ENABLE_FP16 defined          -> floatX is half,           PRECISION_MODE is PRECISION_FP16
//   neither of the two defined   -> floatX is __nv_bfloat16,  PRECISION_MODE is PRECISION_BF16
// ENABLE_FP32 takes priority if both ENABLE_FP32 and ENABLE_FP16 are defined.
// NOTE(review): the default branch is taken without ENABLE_BF16 being defined, while the
// __ldcs/__stcs fallback further down in this header is guarded by defined(ENABLE_BF16).
// Presumably the build always passes -DENABLE_BF16 for bf16 builds - confirm.
#if defined(ENABLE_FP32)
typedef float floatX;
#define PRECISION_MODE PRECISION_FP32
// use fp16 (note: this may require gradient scaler, currently not implemented!)
#elif defined(ENABLE_FP16)
typedef half floatX;
#define PRECISION_MODE PRECISION_FP16
#else // Default to bfloat16
typedef __nv_bfloat16 floatX;
#define PRECISION_MODE PRECISION_BF16
#endif

// ----------------------------------------------------------------------------
// Load and store with streaming cache hints
// Older nvcc does not provide __ldcs and __stcs for bfloat16, despite these
// actually just being unsigned shorts. We need to be careful here to only define
// our own versions if none already exist, otherwise the compiler will complain.
// If not, you easily get "no viable overload" (for sm52) and "function already exists" (sm_80)

#if defined(ENABLE_BF16) && (__CUDACC_VER_MAJOR__ < 12) && !((__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__))
// bf16 overload of __ldcs: performs the load through the unsigned short
// overload on the same address and rewraps the 16 raw bits as a floatX.
__device__ floatX __ldcs(const floatX* address) {
    const unsigned short* raw_address = reinterpret_cast<const unsigned short*>(address);
    __nv_bfloat16_raw raw_value;
    raw_value.x = __ldcs(raw_address);
    return raw_value;
}

// bf16 overload of __stcs: extracts the 16 raw bits of `value` and stores them
// through the unsigned short overload on the same address.
__device__ void __stcs(floatX* address, floatX value) {
    const __nv_bfloat16_raw raw_value = (__nv_bfloat16_raw)value;
    unsigned short* raw_address = reinterpret_cast<unsigned short*>(address);
    __stcs(raw_address, raw_value.x);
}
#endif

// ----------------------------------------------------------------------------
// Profiler utils

// Scope guard for an NVTX profiler range: the constructor pushes a named range
// and the destructor pops it, so the range covers the lifetime of the object.
// Copying is disabled because every copy would run the destructor and pop a
// range that was only pushed once, unbalancing the NVTX range stack.
class NvtxRange {
 public:
    // Push a range labelled with the given C string.
    NvtxRange(const char* s) { nvtxRangePush(s); }
    // Push a range labelled "<base_str> <number>".
    NvtxRange(const std::string& base_str, int number) {
        std::string range_string = base_str + " " + std::to_string(number);
        nvtxRangePush(range_string.c_str());
    }
    NvtxRange(const NvtxRange&) = delete;
    NvtxRange& operator=(const NvtxRange&) = delete;
    // Pop the range pushed by the constructor.
    ~NvtxRange() { nvtxRangePop(); }
};
// Opens a range named after the enclosing function that lasts until the end of the current scope.
#define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__)

// ----------------------------------------------------------------------------
// Utilities to Read & Write between CUDA memory <-> files

// Copy num_bytes from device pointer src into file dest, using double buffering running on the given stream.
//   dest        - open FILE* the bytes are appended to via fwriteCheck (declared in utils.h).
//   src         - device pointer to read from.
//   num_bytes   - total number of bytes to transfer.
//   buffer_size - size in bytes of ONE staging buffer; 2*buffer_size bytes of pinned host
//                 memory are allocated for the duration of the call. Must be > 0 whenever
//                 num_bytes > 0, otherwise the process is terminated with an error message.
//   stream      - stream the device->host copies are issued on; it is synchronized after
//                 every chunk, so all copies have completed when the function returns.
// CUDA failures are fatal (cudaCheck).
inline void device_to_file(FILE* dest, void* src, size_t num_bytes, size_t buffer_size, cudaStream_t stream) {
    // a zero-sized staging buffer can never make progress: every chunk would be 0 bytes
    // and the main loop below would spin forever.
    if (buffer_size == 0 && num_bytes > 0) {
        fprintf(stderr, "[ERROR] device_to_file: buffer_size must be > 0 to copy %zu bytes\n", num_bytes);
        exit(EXIT_FAILURE);
    }

    // allocate pinned buffer for faster, async transfer
    char* buffer_space;
    cudaCheck(cudaMallocHost(&buffer_space, 2*buffer_size));
    // split allocation in two
    void* read_buffer = buffer_space;
    void* write_buffer = buffer_space + buffer_size;

    // prime the read buffer; first copy means we have to wait
    char* gpu_read_ptr = (char*)src;
    size_t copy_amount = std::min(buffer_size, num_bytes);
    cudaCheck(cudaMemcpyAsync(read_buffer, gpu_read_ptr, copy_amount, cudaMemcpyDeviceToHost, stream));
    cudaCheck(cudaStreamSynchronize(stream));
    size_t rest_bytes = num_bytes - copy_amount;
    size_t write_buffer_size = copy_amount;
    gpu_read_ptr += copy_amount;

    // the chunk that was just downloaded becomes the one to write to disk
    std::swap(read_buffer, write_buffer);
    // now the main loop; as long as there are bytes left
    while(rest_bytes > 0) {
        // initiate next read
        copy_amount = std::min(buffer_size, rest_bytes);
        cudaCheck(cudaMemcpyAsync(read_buffer, gpu_read_ptr, copy_amount, cudaMemcpyDeviceToHost, stream));
        // while this is going on, transfer the write buffer to disk
        fwriteCheck(write_buffer, 1, write_buffer_size, dest);
        cudaCheck(cudaStreamSynchronize(stream));     // wait for both buffers to be ready.

        std::swap(read_buffer, write_buffer);
        rest_bytes -= copy_amount;
        write_buffer_size = copy_amount;
        gpu_read_ptr += copy_amount;
    }

    // make sure to write the last remaining write buffer
    fwriteCheck(write_buffer, 1, write_buffer_size, dest);
    cudaCheck(cudaFreeHost(buffer_space));
}

// Copy num_bytes from file src into device pointer dest, using double buffering running on the given stream.
//   dest        - device pointer to write to.
//   src         - open FILE* the bytes are read from via freadCheck (declared in utils.h).
//   num_bytes   - total number of bytes to transfer.
//   buffer_size - size in bytes of ONE staging buffer; 2*buffer_size bytes of pinned host
//                 memory are allocated for the duration of the call. Must be > 0 whenever
//                 num_bytes > 0, otherwise the process is terminated with an error message.
//   stream      - stream the host->device copies are issued on; it is synchronized before
//                 the function returns.
// CUDA failures are fatal (cudaCheck).
inline void file_to_device(void* dest, FILE* src, size_t num_bytes, size_t buffer_size, cudaStream_t stream) {
    // a zero-sized staging buffer can never make progress: every chunk would be 0 bytes
    // and the main loop below would spin forever.
    if (buffer_size == 0 && num_bytes > 0) {
        fprintf(stderr, "[ERROR] file_to_device: buffer_size must be > 0 to copy %zu bytes\n", num_bytes);
        exit(EXIT_FAILURE);
    }

    // allocate pinned buffer for faster, async transfer
    // from the docs (https://developer.download.nvidia.com/compute/DevZone/docs/html/C/doc/html/group__CUDART__HIGHLEVEL_ge439496de696b166ba457dab5dd4f356.html)
    // WC memory is a good option for buffers that will be written by the CPU and read by the device via mapped pinned memory or host->device transfers.
    char* buffer_space;
    cudaCheck(cudaMallocHost(&buffer_space, 2*buffer_size, cudaHostAllocWriteCombined));
    // split allocation in two
    void* read_buffer = buffer_space;
    void* write_buffer = buffer_space + buffer_size;

    // prime the read buffer;
    char* gpu_write_ptr = (char*)dest;
    size_t copy_amount = std::min(buffer_size, num_bytes);
    freadCheck(read_buffer, 1, copy_amount, src);

    size_t rest_bytes = num_bytes - copy_amount;
    size_t write_buffer_size = copy_amount;
    // the chunk that was just read from disk becomes the one to upload
    std::swap(read_buffer, write_buffer);

    // now the main loop; as long as there are bytes left
    while(rest_bytes > 0) {
        // initiate the upload of the chunk that was read in the previous step
        copy_amount = std::min(buffer_size, rest_bytes);
        cudaCheck(cudaMemcpyAsync(gpu_write_ptr, write_buffer, write_buffer_size, cudaMemcpyHostToDevice, stream));
        gpu_write_ptr += write_buffer_size;
        // while this is going on, read from disk
        freadCheck(read_buffer, 1, copy_amount, src);
        cudaCheck(cudaStreamSynchronize(stream));     // wait for both buffers to be ready.

        std::swap(read_buffer, write_buffer);
        rest_bytes -= copy_amount;
        write_buffer_size = copy_amount;
    }

    // copy the last remaining write buffer to gpu
    cudaCheck(cudaMemcpyAsync(gpu_write_ptr, write_buffer, write_buffer_size, cudaMemcpyHostToDevice, stream));
    cudaCheck(cudaStreamSynchronize(stream));
    cudaCheck(cudaFreeHost(buffer_space));
}

#endif // CUDA_COMMON_H
Loading
Loading