Merged
90 changes: 90 additions & 0 deletions src/hpc/gguf.rs
@@ -225,6 +225,12 @@ pub fn read_tensor_f32<R: Read + Seek>(
GgmlType::Q8_0 => {
dequantize_q8_0(reader, n_elements)
}
GgmlType::Q4_0 => {
dequantize_q4_0(reader, n_elements)
}
GgmlType::Q4_K => {
dequantize_q4_k(reader, n_elements)
}
other => Err(format!("Unsupported dtype for dequantization: {:?}", other)),
}
}
@@ -317,6 +323,90 @@ fn dequantize_q8_0<R: Read>(r: &mut R, n_elements: usize) -> Result<Vec<f32>, St
Ok(result)
}

/// Dequantize Q4_0: each block = 2 bytes scale (f16) + 16 bytes (32 nibbles).
fn dequantize_q4_0<R: Read>(r: &mut R, n_elements: usize) -> Result<Vec<f32>, String> {
let block_size = 32;
let n_blocks = (n_elements + block_size - 1) / block_size;
let mut result = Vec::with_capacity(n_elements);

for _ in 0..n_blocks {
let mut scale_buf = [0u8; 2];
r.read_exact(&mut scale_buf).map_err(|e| e.to_string())?;
let scale = f16_to_f32(u16::from_le_bytes(scale_buf));

let mut nibbles = [0u8; 16];
r.read_exact(&mut nibbles).map_err(|e| e.to_string())?;

// GGUF Q4_0 layout: the low nibbles are elements 0..16 of the block and the
// high nibbles are elements 16..32, so emit all lows before all highs.
for &byte in &nibbles {
let lo = (byte & 0x0F) as i8 - 8;
result.push(lo as f32 * scale);
}
for &byte in &nibbles {
let hi = ((byte >> 4) & 0x0F) as i8 - 8;
result.push(hi as f32 * scale);
}
}

result.truncate(n_elements);
Ok(result)
}
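
// A minimal round-trip sketch (illustrative, not part of the original change):
// feed one hand-built Q4_0 block through `dequantize_q4_0` via `std::io::Cursor`.
// The byte values are assumptions chosen so the expected output is obvious.
#[cfg(test)]
mod q4_0_sketch {
use super::*;
use std::io::Cursor;

#[test]
fn dequantizes_a_zero_block() {
// f16 1.0 is 0x3C00, written little-endian as [0x00, 0x3C].
let mut block = vec![0x00u8, 0x3C];
// 16 quant bytes of 0x88: every nibble is 8, which maps to 0 after the -8 offset.
block.extend(std::iter::repeat(0x88u8).take(16));

let out = dequantize_q4_0(&mut Cursor::new(block), 32).unwrap();
assert_eq!(out.len(), 32);
assert!(out.iter().all(|&v| v == 0.0));
}
}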

/// Dequantize Q4_K: super-blocks of 256 elements.
///
/// Q4_K block layout (144 bytes for 256 elements):
/// - 2 bytes: d (f16 scale)
/// - 2 bytes: dmin (f16 min)
/// - 12 bytes: 8 scale/min pairs (6 bits each, packed)
/// - 128 bytes: 256 4-bit quantized values (nibbles)
fn dequantize_q4_k<R: Read>(r: &mut R, n_elements: usize) -> Result<Vec<f32>, String> {
let block_size = 256;
let n_blocks = (n_elements + block_size - 1) / block_size;
let mut result = Vec::with_capacity(n_elements);

for _ in 0..n_blocks {
// Read d and dmin (f16)
let mut d_buf = [0u8; 2];
let mut dmin_buf = [0u8; 2];
r.read_exact(&mut d_buf).map_err(|e| e.to_string())?;
r.read_exact(&mut dmin_buf).map_err(|e| e.to_string())?;
let d = f16_to_f32(u16::from_le_bytes(d_buf));
let dmin = f16_to_f32(u16::from_le_bytes(dmin_buf));

// Read scales (12 bytes = 8 sub-block scales + 8 sub-block mins, 6-bit packed)
let mut scales_raw = [0u8; 12];
r.read_exact(&mut scales_raw).map_err(|e| e.to_string())?;

// Decode 8 scale/min pairs from 12 bytes (6 bits each). Pairs 0..4 are the
// low 6 bits of bytes 0..4 (scales) and 4..8 (mins); pairs 4..8 take their
// low 4 bits from bytes 8..12 and their top 2 bits from the high bits of
// bytes 0..4 (scales) and 4..8 (mins).
let mut sc = [0u8; 8];
let mut mn = [0u8; 8];
for i in 0..4 {
sc[i] = scales_raw[i] & 0x3F;
mn[i] = scales_raw[i + 4] & 0x3F;
sc[i + 4] = (scales_raw[i + 8] & 0x0F) | ((scales_raw[i] >> 6) << 4);
mn[i + 4] = (scales_raw[i + 8] >> 4) | ((scales_raw[i + 4] >> 6) << 4);
}

// Read 128 bytes of nibbles (256 4-bit values)
let mut nibbles = [0u8; 128];
r.read_exact(&mut nibbles).map_err(|e| e.to_string())?;

// Dequantize: qs holds four 32-byte chunks; within a chunk the low nibbles
// form one 32-element sub-block and the high nibbles the next, so scale/min
// pair 2*c applies to the lows and pair 2*c + 1 to the highs.
for c in 0..4 {
let chunk = &nibbles[c * 32..(c + 1) * 32];
let (d_lo, m_lo) = (d * sc[2 * c] as f32, dmin * mn[2 * c] as f32);
let (d_hi, m_hi) = (d * sc[2 * c + 1] as f32, dmin * mn[2 * c + 1] as f32);
for &byte in chunk {
result.push((byte & 0x0F) as f32 * d_lo - m_lo);
}
for &byte in chunk {
result.push((byte >> 4) as f32 * d_hi - m_hi);
}
}
}

result.truncate(n_elements);
Ok(result)
}
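
// A companion sketch (illustrative, not part of the original change): one
// hand-built Q4_K super-block with d = 1.0, dmin = 0.0 and all packed scales
// zero, so every output must be 0.0 regardless of the nibble contents.
#[cfg(test)]
mod q4_k_sketch {
use super::*;
use std::io::Cursor;

#[test]
fn dequantizes_a_zero_scale_block() {
let mut block = Vec::with_capacity(144);
block.extend_from_slice(&[0x00, 0x3C]); // d = 1.0 (f16, little-endian)
block.extend_from_slice(&[0x00, 0x00]); // dmin = 0.0
block.extend(std::iter::repeat(0u8).take(12)); // all sub-block scales/mins = 0
block.extend(std::iter::repeat(0xABu8).take(128)); // arbitrary nibbles

let out = dequantize_q4_k(&mut Cursor::new(block), 256).unwrap();
assert_eq!(out.len(), 256);
assert!(out.iter().all(|&v| v == 0.0));
}
}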

/// Convert f16 bit pattern to f32.
fn f16_to_f32(bits: u16) -> f32 {
let sign = ((bits >> 15) & 1) as u32;
252 changes: 252 additions & 0 deletions src/hpc/gpt2/api.rs
@@ -0,0 +1,252 @@
//! OpenAI-compatible API types for GPT-2 inference.
//!
//! Provides request/response structs matching the OpenAI API surface:
//! - `/v1/completions` — text completion
//! - `/v1/embeddings` — token embeddings via wte
//! - `/v1/models` — model listing
//!
//! These types are transport-agnostic — they serialize/deserialize
//! but don't depend on any HTTP framework.

use super::inference::{GeneratedToken, Gpt2Engine};
use super::weights::*;

// ============================================================================
// /v1/completions
// ============================================================================

/// Request body for /v1/completions.
#[derive(Clone, Debug)]
pub struct CompletionRequest {
/// Model name (ignored — we only have gpt2).
pub model: String,
/// Prompt as token IDs (tokenization happens externally).
pub prompt_tokens: Vec<u32>,
/// Maximum tokens to generate.
pub max_tokens: usize,
/// Sampling temperature (1.0 is effectively greedy).
pub temperature: f32,
/// Stop token ID (default: 50256 = <|endoftext|>).
pub stop_token: Option<u32>,
}

impl Default for CompletionRequest {
fn default() -> Self {
Self {
model: "gpt2".into(),
prompt_tokens: Vec::new(),
max_tokens: 128,
temperature: 1.0,
stop_token: Some(50256),
}
}
}

/// Single completion choice.
#[derive(Clone, Debug)]
pub struct CompletionChoice {
pub index: usize,
pub tokens: Vec<GeneratedToken>,
pub finish_reason: FinishReason,
}

/// Why generation stopped.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum FinishReason {
Stop,
Length,
}

/// Response body for /v1/completions.
#[derive(Clone, Debug)]
pub struct CompletionResponse {
pub id: String,
pub model: String,
pub choices: Vec<CompletionChoice>,
pub usage: Usage,
}

/// Token usage statistics.
#[derive(Clone, Debug, Default)]
pub struct Usage {
pub prompt_tokens: usize,
pub completion_tokens: usize,
pub total_tokens: usize,
}

// ============================================================================
// /v1/embeddings
// ============================================================================

/// Request body for /v1/embeddings.
#[derive(Clone, Debug)]
pub struct EmbeddingRequest {
pub model: String,
/// Token IDs to embed (one embedding per token).
pub input_tokens: Vec<u32>,
}

/// Single embedding result.
#[derive(Clone, Debug)]
pub struct EmbeddingData {
pub index: usize,
pub embedding: Vec<f32>,
}

/// Response body for /v1/embeddings.
#[derive(Clone, Debug)]
pub struct EmbeddingResponse {
pub model: String,
pub data: Vec<EmbeddingData>,
pub usage: Usage,
}

// ============================================================================
// /v1/models
// ============================================================================

/// Model info for /v1/models.
#[derive(Clone, Debug)]
pub struct ModelInfo {
pub id: String,
pub owned_by: String,
pub vocab_size: usize,
pub embed_dim: usize,
pub num_layers: usize,
pub num_heads: usize,
pub max_seq_len: usize,
}

impl ModelInfo {
/// GPT-2 small (124M) model info.
pub fn gpt2_small() -> Self {
Self {
id: "gpt2".into(),
owned_by: "adaworldapi".into(),
vocab_size: VOCAB_SIZE,
embed_dim: EMBED_DIM,
num_layers: NUM_LAYERS,
num_heads: NUM_HEADS,
max_seq_len: MAX_SEQ_LEN,
}
}
}

// ============================================================================
// Engine wrapper — request/response API over the stateful engine
// ============================================================================

/// API wrapper around Gpt2Engine.
/// Handles request→response conversion and assigns sequential request IDs.
pub struct Gpt2Api {
engine: Gpt2Engine,
request_counter: u64,
}

impl Gpt2Api {
/// Create from pre-loaded weights.
pub fn new(weights: Gpt2Weights) -> Self {
Self {
engine: Gpt2Engine::new(weights),
request_counter: 0,
}
}

/// /v1/completions handler.
pub fn complete(&mut self, req: &CompletionRequest) -> CompletionResponse {
self.request_counter += 1;

let generated = self.engine.generate(
&req.prompt_tokens,
req.max_tokens,
req.temperature,
);

let finish_reason = if generated.len() < req.max_tokens {
FinishReason::Stop
} else {
FinishReason::Length
};

let completion_tokens = generated.len();
let prompt_tokens = req.prompt_tokens.len();

CompletionResponse {
id: format!("cmpl-{}", self.request_counter),
model: "gpt2".into(),
choices: vec![CompletionChoice {
index: 0,
tokens: generated,
finish_reason,
}],
usage: Usage {
prompt_tokens,
completion_tokens,
total_tokens: prompt_tokens + completion_tokens,
},
}
}

/// /v1/embeddings handler — returns wte embeddings for token IDs.
pub fn embed(&self, req: &EmbeddingRequest) -> EmbeddingResponse {
let mut data = Vec::with_capacity(req.input_tokens.len());

for (idx, &token_id) in req.input_tokens.iter().enumerate() {
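// Assumes token_id < VOCAB_SIZE; an out-of-range ID would panic on the slice below.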
let offset = token_id as usize * EMBED_DIM;
let embedding = self.engine.weights().wte[offset..offset + EMBED_DIM].to_vec();
data.push(EmbeddingData {
index: idx,
embedding,
});
}

EmbeddingResponse {
model: "gpt2".into(),
data,
usage: Usage {
prompt_tokens: req.input_tokens.len(),
completion_tokens: 0,
total_tokens: req.input_tokens.len(),
},
}
}

/// /v1/models handler.
pub fn model_info(&self) -> ModelInfo {
ModelInfo::gpt2_small()
}

/// Access the underlying engine (for advanced usage).
pub fn engine_mut(&mut self) -> &mut Gpt2Engine {
&mut self.engine
}
}
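
// A minimal end-to-end usage sketch (illustrative, not part of the original
// change). `Gpt2Weights::load` is a placeholder for whatever loader the crate
// actually exposes, and the token IDs are arbitrary:
//
//     let weights = Gpt2Weights::load("gpt2.gguf")?;
//     let mut api = Gpt2Api::new(weights);
//     let resp = api.complete(&CompletionRequest {
//         prompt_tokens: vec![15496, 995],
//         max_tokens: 16,
//         ..Default::default()
//     });
//     assert_eq!(resp.usage.prompt_tokens, 2);
//     assert_eq!(resp.choices.len(), 1);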

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_model_info() {
let info = ModelInfo::gpt2_small();
assert_eq!(info.vocab_size, 50257);
assert_eq!(info.embed_dim, 768);
assert_eq!(info.num_layers, 12);
assert_eq!(info.num_heads, 12);
assert_eq!(info.max_seq_len, 1024);
}

#[test]
fn test_completion_request_default() {
let req = CompletionRequest::default();
assert_eq!(req.max_tokens, 128);
assert_eq!(req.temperature, 1.0);
assert_eq!(req.stop_token, Some(50256));
}
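
#[test]
fn test_completion_request_override() {
// Sketch of the intended caller pattern: override a couple of fields and
// keep the rest of the defaults via struct-update syntax.
let req = CompletionRequest {
max_tokens: 8,
temperature: 0.7,
..Default::default()
};
assert_eq!(req.max_tokens, 8);
assert_eq!(req.model, "gpt2");
assert_eq!(req.stop_token, Some(50256));
}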

#[test]
fn test_finish_reason_variants() {
assert_eq!(FinishReason::Stop, FinishReason::Stop);
assert_ne!(FinishReason::Stop, FinishReason::Length);
}
}