24 changes: 18 additions & 6 deletions .github/workflows/release.yml
@@ -75,27 +75,27 @@ jobs:
include:
- os: ubuntu-latest
target: x86_64-linux-gnu
-artifact_name: zigformer-cli-linux-x86_64.zip
+artifact_name: zigformer-linux-x86_64.zip
asset_content_type: application/zip
- os: ubuntu-latest
target: aarch64-linux-gnu
-artifact_name: zigformer-cli-linux-aarch64.zip
+artifact_name: zigformer-linux-aarch64.zip
asset_content_type: application/zip
- os: macos-latest
target: aarch64-macos # Apple Silicon (ARM64)
-artifact_name: zigformer-cli-macos-aarch64.zip
+artifact_name: zigformer-macos-aarch64.zip
asset_content_type: application/zip
- os: macos-latest
target: x86_64-macos # Intel
-artifact_name: zigformer-cli-macos-x86_64.zip
+artifact_name: zigformer-macos-x86_64.zip
asset_content_type: application/zip
- os: windows-latest
target: x86_64-windows-msvc
-artifact_name: zigformer-cli-windows-x86_64.zip
+artifact_name: zigformer-windows-x86_64.zip
asset_content_type: application/zip
- os: windows-latest
target: aarch64-windows-msvc
-artifact_name: zigformer-cli-windows-aarch64.zip
+artifact_name: zigformer-windows-aarch64.zip
asset_content_type: application/zip

steps:
@@ -120,6 +120,11 @@ jobs:
ls -l zig-out/bin || true
exit 1
fi
+if [ ! -f zig-out/bin/zigformer-gui ]; then
+echo "ERROR: built binary zig-out/bin/zigformer-gui not found"
+ls -l zig-out/bin || true
+exit 1
+fi

- name: Verify built binary exists (Windows)
if: runner.os == 'Windows'
@@ -129,13 +134,19 @@
dir zig-out\\bin
exit 1
}
+if (!(Test-Path "zig-out\\bin\\zigformer-gui.exe")) {
+Write-Host 'ERROR: built binary zig-out\\bin\\zigformer-gui.exe not found'
+dir zig-out\\bin
+exit 1
+}

- name: Prepare artifact for Linux/macOS
if: runner.os != 'Windows'
run: |
RELEASE_DIR="release-${{ matrix.target }}-${{ github.run_id }}"
mkdir -p "$RELEASE_DIR"
mv zig-out/bin/zigformer-cli "$RELEASE_DIR"/
+mv zig-out/bin/zigformer-gui "$RELEASE_DIR"/
# Create a zip of the release contents (avoid nesting an extra top-level directory)
(cd "$RELEASE_DIR" && zip -r "../${{ matrix.artifact_name }}" .)

@@ -145,6 +156,7 @@
$env:RELEASE_DIR = "release-${{ matrix.target }}-${{ github.run_id }}"
New-Item -ItemType Directory -Path $env:RELEASE_DIR -Force | Out-Null
Move-Item zig-out\bin\zigformer-cli.exe $env:RELEASE_DIR\
+Move-Item zig-out\bin\zigformer-gui.exe $env:RELEASE_DIR\
Compress-Archive -Path "$env:RELEASE_DIR\*" -DestinationPath ${{ matrix.artifact_name }}

- name: Upload Release Asset
4 changes: 2 additions & 2 deletions README.md
@@ -28,7 +28,7 @@ and [nanochat](https://github.com/karpathy/nanochat) projects, and follows the a
["Language Models are Unsupervised Multitask Learners"](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
papers.
It can be used as a [Zig library](https://CogitatorTech.github.io/zigformer/) for building LLMs or as a
-[standalone application](https://github.com/CogitatorTech/zigformer/releases)
+[standalone application](https://github.com/CogitatorTech/zigformer/releases/latest)
for training, inference, and chatting with the model.

The diagrams below show the high-level architecture and its core components.
@@ -171,7 +171,7 @@ zig build run -- predict --prompt "How do mountains form?" --top-k 5 --load-mode
zig build run-gui -- --load-model model.bin
```

-![ZigFormer Web UI](assets/screenshots/zigformer_webui_v0.1.0.jpeg)
+![ZigFormer Web UI](assets/screenshots/zigformer_webui_v0.1.1.jpeg)

---

Binary file removed assets/screenshots/zigformer_webui_v0.1.0.jpeg
Binary file added assets/screenshots/zigformer_webui_v0.1.1.jpeg
2 changes: 1 addition & 1 deletion build.zig.zon
@@ -1,6 +1,6 @@
.{
.name = .zigformer,
.version = "0.1.0",
.version = "0.1.1",
.fingerprint = 0xe046275379ddc4d8, // Changing this has security and trust implications.
.minimum_zig_version = "0.15.2",
.dependencies = .{
31 changes: 2 additions & 29 deletions src/cli.zig
@@ -216,7 +216,7 @@ fn trainAndMaybeRepl(allocator: std.mem.Allocator, pretrain_path: []const u8, ch
std.debug.print("Type a prompt and press Enter to generate text.\n", .{});
std.debug.print("Type 'exit' to quit.\n", .{});

-const stdin_file = std.fs.File{ .handle = std.posix.STDIN_FILENO };
+const stdin_file = std.fs.File.stdin();
const stdin = stdin_file.deprecatedReader();
var buffer: [1024]u8 = undefined;
while (true) {
@@ -245,33 +245,6 @@ fn execRoot(ctx: chilli.CommandContext) !void {
var config = if (config_path.len > 0) try loadConfig(allocator, config_path) else Config{};
defer config.deinit(allocator);

-// Override with CLI flags if provided (checking if they are different from defaults is tricky with chilli,
-// so we'll assume CLI flags take precedence if they are set to non-default values or if we just use them directly.
-// Actually, a better approach is: use config as base, then overwrite with CLI flags.
-// But chilli returns defaults if flag is missing.
-// So we need to know if flag was actually passed. Chilli doesn't easily expose this.
-// For now, let's just use CLI flags if config is NOT present, OR if we want to support overrides,
-// we have to accept that CLI defaults might overwrite config values.
-// To solve this properly:
-// 1. Load config
-// 2. For each field, check if CLI flag was passed (not easy with current chilli usage).
-// Alternative: We only use config if --config is passed, and ignore other flags? No, overrides are good.
-// Let's assume: Config file sets defaults. CLI flags override.
-// But chilli returns default values if flag is missing.
-// So if config has batch_size=64, and CLI default is 32, and user runs without --batch-size, chilli returns 32.
-// If we overwrite config with 32, we lose the config value.
-// We need to check if the flag was present.
-// Since we can't easily do that, let's prioritize CLI flags ONLY if they are explicitly different from our hardcoded defaults?
-// Or simpler: If --config is passed, we use it. We can manually parse args to see if flags are present, but that's messy.
-
-// Let's stick to the plan: Config file sets values. CLI flags override.
-// If we want CLI to override, we need to know if user typed it.
-// Given the constraints, let's do this:
-// If --config is present, use it.
-// AND we will NOT read other flags if --config is present, to avoid confusion.
-// OR we can say: CLI flags are ignored if --config is present, EXCEPT for interactive/save-model maybe?
-// Let's go with: If --config is present, it is the source of truth.

if (config_path.len > 0) {
std.debug.print("Loaded configuration from {s}\n", .{config_path});
} else {
@@ -361,7 +334,7 @@ pub fn main() anyerror!void {
var root_cmd = try chilli.Command.init(allocator, .{
.name = "zigformer-cli",
.description = "An educational transformer-based LLM in Zig",
.version = "v0.1.0",
.version = "v0.1.1",
.exec = execRoot,
});
defer root_cmd.deinit();
4 changes: 2 additions & 2 deletions src/gui.zig
@@ -27,7 +27,7 @@ const Config = struct {
}
};

-// Global state for the server
+// Global state holders for the server
const ServerState = struct {
allocator: std.mem.Allocator,
model: *llm.LLM,
@@ -421,7 +421,7 @@ pub fn main() !void {
var cmd = try chilli.Command.init(allocator, .{
.name = "zigformer-gui",
.description = "Web GUI for ZigFormer",
.version = "0.1.0",
.version = "0.1.1",
.exec = execGui,
});
defer cmd.deinit();
2 changes: 1 addition & 1 deletion src/gui/index.html
@@ -381,7 +381,7 @@

<div class="control">
<label>
-Top-k
+Top-p
<span class="value-display" id="top-p-val">0.0</span>
</label>
<input id="top-p" max="1" min="0" step="0.05" type="range" value="0">
4 changes: 2 additions & 2 deletions src/lib.zig
@@ -1,6 +1,6 @@
//! ## ZigFormer
//!
-//! ZigFormer is an implementation of a transformer-based language model (like GPT-2)
+//! ZigFormer is an implementation of a transformer-based language model (LLM)
//! written in pure Zig. It provides a framework for understanding and experimenting with
//! transformer architectures.
//!
@@ -40,7 +40,7 @@
//! - `self_attention`: Multi-head self-attention mechanism
//! - `feed_forward`: Position-wise feed-forward networks
//! - `layer_norm`: Layer normalization
-//! - `transformer`: Transformer block (attention and FFN)
+//! - `transformer`: Transformer block (feed-forward and attention layers)
//! - `output_projection`: Output layer for vocabulary prediction
//! - `llm`: Language model implementation with training and inference API
//! - `optimizer`: Adam optimizer
18 changes: 0 additions & 18 deletions src/zigformer/llm.zig
@@ -674,24 +674,6 @@ pub const LLM = struct {
}

pub fn setBatchSize(self: *LLM, batch_size: usize) void {
-// Iterate over layers and set batch_size for SelfAttention layers
-// Note: We need to know which layers are SelfAttention.
-// In our simple structure, we know layers 1, 2, 3 are TransformerBlocks.
-// TransformerBlock contains SelfAttention.
-// But Layer is type-erased.
-// Ideally, we should add setBatchSize to Layer vtable, but that's a big change.
-// For now, we'll rely on the known structure and pointer casting, which is risky but fits the current style.
-// Actually, TransformerBlock has a setBatchSize method we should add.
-// Let's assume we add setBatchSize to TransformerBlock and call it here.
-
-// Wait, we can't easily cast opaque pointers back to types without RTTI or knowing the type.
-// Given the fixed structure:
-// Layer 0: Embeddings (no batch_size needed)
-// Layer 1: TransformerBlock
-// Layer 2: TransformerBlock
-// Layer 3: TransformerBlock
-// Layer 4: OutputProjection (no batch_size needed)

const embeddings: *Embeddings = @ptrCast(@alignCast(self.network.items[0].self));
embeddings.setBatchSize(batch_size);

7 changes: 0 additions & 7 deletions src/zigformer/self_attention.zig
@@ -635,13 +635,6 @@ test "SelfAttention Causal Masking with Cache" {
var output = try attn.forward(input, true);
defer output.deinit();

-// Check cached_attention_scores
-// Should be (num_heads * seq_len) x seq_len
-// For each head, the 4x4 score matrix should be lower triangular (masked).
-// scores[i, j] should be -inf (or very small after softmax, but we check pre-softmax scores if possible?
-// Wait, cached_attention_scores stores POST-softmax scores.
-// So masked values should be 0.0.

const scores = attn.cached_attention_scores;
// Rows: num_heads * seq_len
// Cols: seq_len (since cache_len = seq_len after first pass)