diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 2b3f0e3..2bd42e2 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -75,27 +75,27 @@ jobs:
         include:
           - os: ubuntu-latest
             target: x86_64-linux-gnu
-            artifact_name: zigformer-cli-linux-x86_64.zip
+            artifact_name: zigformer-linux-x86_64.zip
             asset_content_type: application/zip
           - os: ubuntu-latest
            target: aarch64-linux-gnu
-            artifact_name: zigformer-cli-linux-aarch64.zip
+            artifact_name: zigformer-linux-aarch64.zip
             asset_content_type: application/zip
           - os: macos-latest
             target: aarch64-macos # Apple Silicon (ARM64)
-            artifact_name: zigformer-cli-macos-aarch64.zip
+            artifact_name: zigformer-macos-aarch64.zip
             asset_content_type: application/zip
           - os: macos-latest
             target: x86_64-macos # Intel
-            artifact_name: zigformer-cli-macos-x86_64.zip
+            artifact_name: zigformer-macos-x86_64.zip
             asset_content_type: application/zip
           - os: windows-latest
             target: x86_64-windows-msvc
-            artifact_name: zigformer-cli-windows-x86_64.zip
+            artifact_name: zigformer-windows-x86_64.zip
             asset_content_type: application/zip
           - os: windows-latest
             target: aarch64-windows-msvc
-            artifact_name: zigformer-cli-windows-aarch64.zip
+            artifact_name: zigformer-windows-aarch64.zip
             asset_content_type: application/zip

     steps:
@@ -120,6 +120,11 @@ jobs:
             ls -l zig-out/bin || true
             exit 1
           fi
+          if [ ! -f zig-out/bin/zigformer-gui ]; then
+            echo "ERROR: built binary zig-out/bin/zigformer-gui not found"
+            ls -l zig-out/bin || true
+            exit 1
+          fi

       - name: Verify built binary exists (Windows)
         if: runner.os == 'Windows'
@@ -129,6 +134,11 @@
             dir zig-out\\bin
             exit 1
           }
+          if (!(Test-Path "zig-out\\bin\\zigformer-gui.exe")) {
+            Write-Host 'ERROR: built binary zig-out\\bin\\zigformer-gui.exe not found'
+            dir zig-out\\bin
+            exit 1
+          }

       - name: Prepare artifact for Linux/macOS
         if: runner.os != 'Windows'
@@ -136,6 +146,7 @@
           RELEASE_DIR="release-${{ matrix.target }}-${{ github.run_id }}"
           mkdir -p "$RELEASE_DIR"
           mv zig-out/bin/zigformer-cli "$RELEASE_DIR"/
+          mv zig-out/bin/zigformer-gui "$RELEASE_DIR"/
           # Create a zip of the release contents (avoid nesting an extra top-level directory)
           (cd "$RELEASE_DIR" && zip -r "../${{ matrix.artifact_name }}" .)

@@ -145,6 +156,7 @@
          $env:RELEASE_DIR = "release-${{ matrix.target }}-${{ github.run_id }}"
          New-Item -ItemType Directory -Path $env:RELEASE_DIR -Force | Out-Null
          Move-Item zig-out\bin\zigformer-cli.exe $env:RELEASE_DIR\
+         Move-Item zig-out\bin\zigformer-gui.exe $env:RELEASE_DIR\
          Compress-Archive -Path "$env:RELEASE_DIR\*" -DestinationPath ${{ matrix.artifact_name }}

       - name: Upload Release Asset
diff --git a/README.md b/README.md
index 8aeb5f4..ec6760c 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ and [nanochat](https://github.com/karpathy/nanochat) projects, and follows the a
 ["Language Models are Unsupervised Multitask Learners"](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
 papers. It can be used as a
 [Zig library](https://CogitatorTech.github.io/zigformer/) for building LLMs or as a
-[standalone application](https://github.com/CogitatorTech/zigformer/releases)
+[standalone application](https://github.com/CogitatorTech/zigformer/releases/latest)
 for training, inference, and chatting with the model.
 The diagrams below show the high-level architecture and its core components.

@@ -171,7 +171,7 @@ zig build run -- predict --prompt "How do mountains form?" --top-k 5 --load-mode
 zig build run-gui -- --load-model model.bin
 ```

-![ZigFormer Web UI](assets/screenshots/zigformer_webui_v0.1.0.jpeg)
+![ZigFormer Web UI](assets/screenshots/zigformer_webui_v0.1.1.jpeg)

 ---
diff --git a/assets/screenshots/zigformer_webui_v0.1.0.jpeg b/assets/screenshots/zigformer_webui_v0.1.0.jpeg
deleted file mode 100644
index 5a7acc0..0000000
Binary files a/assets/screenshots/zigformer_webui_v0.1.0.jpeg and /dev/null differ
diff --git a/assets/screenshots/zigformer_webui_v0.1.1.jpeg b/assets/screenshots/zigformer_webui_v0.1.1.jpeg
new file mode 100644
index 0000000..a55bb4b
Binary files /dev/null and b/assets/screenshots/zigformer_webui_v0.1.1.jpeg differ
diff --git a/build.zig.zon b/build.zig.zon
index 0f7a078..1834553 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -1,6 +1,6 @@
 .{
     .name = .zigformer,
-    .version = "0.1.0",
+    .version = "0.1.1",
     .fingerprint = 0xe046275379ddc4d8, // Changing this has security and trust implications.
     .minimum_zig_version = "0.15.2",
     .dependencies = .{
diff --git a/src/cli.zig b/src/cli.zig
index 7d0efcd..21e8aee 100644
--- a/src/cli.zig
+++ b/src/cli.zig
@@ -216,7 +216,7 @@ fn trainAndMaybeRepl(allocator: std.mem.Allocator, pretrain_path: []const u8, ch
     std.debug.print("Type a prompt and press Enter to generate text.\n", .{});
     std.debug.print("Type 'exit' to quit.\n", .{});

-    const stdin_file = std.fs.File{ .handle = std.posix.STDIN_FILENO };
+    const stdin_file = std.fs.File.stdin();
     const stdin = stdin_file.deprecatedReader();
     var buffer: [1024]u8 = undefined;
     while (true) {
@@ -245,33 +245,6 @@ fn execRoot(ctx: chilli.CommandContext) !void {
     var config = if (config_path.len > 0) try loadConfig(allocator, config_path) else Config{};
     defer config.deinit(allocator);

-    // Override with CLI flags if provided (checking if they are different from defaults is tricky with chilli,
-    // so we'll assume CLI flags take precedence if they are set to non-default values or if we just use them directly.
-    // Actually, a better approach is: use config as base, then overwrite with CLI flags.
-    // But chilli returns defaults if flag is missing.
-    // So we need to know if flag was actually passed. Chilli doesn't easily expose this.
-    // For now, let's just use CLI flags if config is NOT present, OR if we want to support overrides,
-    // we have to accept that CLI defaults might overwrite config values.
-    // To solve this properly:
-    // 1. Load config
-    // 2. For each field, check if CLI flag was passed (not easy with current chilli usage).
-    // Alternative: We only use config if --config is passed, and ignore other flags? No, overrides are good.
-    // Let's assume: Config file sets defaults. CLI flags override.
-    // But chilli returns default values if flag is missing.
-    // So if config has batch_size=64, and CLI default is 32, and user runs without --batch-size, chilli returns 32.
-    // If we overwrite config with 32, we lose the config value.
-    // We need to check if the flag was present.
-    // Since we can't easily do that, let's prioritize CLI flags ONLY if they are explicitly different from our hardcoded defaults?
-    // Or simpler: If --config is passed, we use it. We can manually parse args to see if flags are present, but that's messy.
-
-    // Let's stick to the plan: Config file sets values. CLI flags override.
-    // If we want CLI to override, we need to know if user typed it.
-    // Given the constraints, let's do this:
-    // If --config is present, use it.
-    // AND we will NOT read other flags if --config is present, to avoid confusion.
-    // OR we can say: CLI flags are ignored if --config is present, EXCEPT for interactive/save-model maybe?
-    // Let's go with: If --config is present, it is the source of truth.
-
     if (config_path.len > 0) {
         std.debug.print("Loaded configuration from {s}\n", .{config_path});
     } else {
@@ -361,7 +334,7 @@ pub fn main() anyerror!void {
     var root_cmd = try chilli.Command.init(allocator, .{
         .name = "zigformer-cli",
         .description = "An educational transformer-based LLM in Zig",
-        .version = "v0.1.0",
+        .version = "v0.1.1",
         .exec = execRoot,
     });
     defer root_cmd.deinit();
diff --git a/src/gui.zig b/src/gui.zig
index 548b0c4..3818f43 100644
--- a/src/gui.zig
+++ b/src/gui.zig
@@ -27,7 +27,7 @@ const Config = struct {
     }
 };

-// Global state for the server
+// Global state holders for the server
 const ServerState = struct {
     allocator: std.mem.Allocator,
     model: *llm.LLM,
@@ -421,7 +421,7 @@ pub fn main() !void {
     var cmd = try chilli.Command.init(allocator, .{
         .name = "zigformer-gui",
         .description = "Web GUI for ZigFormer",
-        .version = "0.1.0",
+        .version = "0.1.1",
         .exec = execGui,
     });
     defer cmd.deinit();
diff --git a/src/gui/index.html b/src/gui/index.html
index 6daf374..e504e62 100644
--- a/src/gui/index.html
+++ b/src/gui/index.html
@@ -381,7 +381,7 @@
diff --git a/src/lib.zig b/src/lib.zig
index 5b82665..c886721 100644
--- a/src/lib.zig
+++ b/src/lib.zig
@@ -1,6 +1,6 @@
 //! ## ZigFormer
 //!
-//! ZigFormer is an implementation of a transformer-based language model (like GPT-2)
+//! ZigFormer is an implementation of a transformer-based language model (LLM)
 //! written in pure Zig. It provides a framework for understanding and experimenting with
 //! transformer architectures.
 //!
@@ -40,7 +40,7 @@
 //! - `self_attention`: Multi-head self-attention mechanism
 //! - `feed_forward`: Position-wise feed-forward networks
 //! - `layer_norm`: Layer normalization
-//! - `transformer`: Transformer block (attention and FFN)
+//! - `transformer`: Transformer block (feed-forward and attention layers)
 //! - `output_projection`: Output layer for vocabulary prediction
 //! - `llm`: Language model implementation with training and inference API
 //! - `optimizer`: Adam optimizer
diff --git a/src/zigformer/llm.zig b/src/zigformer/llm.zig
index e65ce27..ccab385 100644
--- a/src/zigformer/llm.zig
+++ b/src/zigformer/llm.zig
@@ -674,24 +674,6 @@ pub const LLM = struct {
     }

     pub fn setBatchSize(self: *LLM, batch_size: usize) void {
-        // Iterate over layers and set batch_size for SelfAttention layers
-        // Note: We need to know which layers are SelfAttention.
-        // In our simple structure, we know layers 1, 2, 3 are TransformerBlocks.
-        // TransformerBlock contains SelfAttention.
-        // But Layer is type-erased.
-        // Ideally, we should add setBatchSize to Layer vtable, but that's a big change.
-        // For now, we'll rely on the known structure and pointer casting, which is risky but fits the current style.
-        // Actually, TransformerBlock has a setBatchSize method we should add.
-        // Let's assume we add setBatchSize to TransformerBlock and call it here.
-
-        // Wait, we can't easily cast opaque pointers back to types without RTTI or knowing the type.
-        // Given the fixed structure:
-        // Layer 0: Embeddings (no batch_size needed)
-        // Layer 1: TransformerBlock
-        // Layer 2: TransformerBlock
-        // Layer 3: TransformerBlock
-        // Layer 4: OutputProjection (no batch_size needed)
-
         const embeddings: *Embeddings = @ptrCast(@alignCast(self.network.items[0].self));
         embeddings.setBatchSize(batch_size);
diff --git a/src/zigformer/self_attention.zig b/src/zigformer/self_attention.zig
index 4ea14df..8a8cd15 100644
--- a/src/zigformer/self_attention.zig
+++ b/src/zigformer/self_attention.zig
@@ -635,13 +635,6 @@ test "SelfAttention Causal Masking with Cache" {
     var output = try attn.forward(input, true);
     defer output.deinit();

-    // Check cached_attention_scores
-    // Should be (num_heads * seq_len) x seq_len
-    // For each head, the 4x4 score matrix should be lower triangular (masked).
-    // scores[i, j] should be -inf (or very small after softmax, but we check pre-softmax scores if possible?
-    // Wait, cached_attention_scores stores POST-softmax scores.
-    // So masked values should be 0.0.
-
     const scores = attn.cached_attention_scores;
     // Rows: num_heads * seq_len
     // Cols: seq_len (since cache_len = seq_len after first pass)
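
Not part of the patch above: a minimal, hypothetical sketch of the stdin pattern that the `src/cli.zig` hunk migrates to, namely `std.fs.File.stdin()` plus the `deprecatedReader()` compatibility shim, assuming the Zig 0.15.x toolchain pinned by `minimum_zig_version` in `build.zig.zon`. The program below is illustrative only and is not taken from the repository.

```zig
const std = @import("std");

pub fn main() !void {
    // Zig 0.15: obtain stdin via std.fs.File.stdin() instead of building a
    // File from std.posix.STDIN_FILENO by hand (the pattern the patch removes).
    const stdin_file = std.fs.File.stdin();
    const stdin = stdin_file.deprecatedReader();

    // Read one chunk of input into a fixed buffer and echo what was read.
    var buffer: [1024]u8 = undefined;
    const n = try stdin.read(&buffer);
    std.debug.print("read {d} bytes: {s}\n", .{ n, buffer[0..n] });
}
```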