Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions tabby/Models/LlamaRuntimeModels.swift
Original file line number Diff line number Diff line change
Expand Up @@ -188,13 +188,21 @@ enum LlamaRuntimeError: LocalizedError {
case unavailable(String)
case cancelled
case generationFailed(String)
/// Signals that the first-token confidence gate aborted generation because the model's top-1
/// raw-logit softmax probability at position 0 was below the configured threshold.
/// The engine layer treats this as a normal "no suggestion" outcome rather than a user-facing
/// failure. Carrying probability/threshold/token in the case lets diagnostics distinguish a
/// suppressed-by-confidence empty from any other empty result.
case lowConfidenceSuppression(probability: Float, threshold: Double, token: String)

/// Human-readable message for each runtime error, surfaced through `LocalizedError`.
///
/// `.unavailable` and `.generationFailed` pass their associated message through unchanged;
/// `.lowConfidenceSuppression` reports the measured probability and the threshold, and
/// leaves the token out of the message.
var errorDescription: String? {
    switch self {
    case .cancelled:
        return "Runtime work was cancelled."
    case .unavailable(let text):
        return text
    case .generationFailed(let text):
        return text
    case .lowConfidenceSuppression(let probability, let threshold, _):
        return "Suggestion suppressed: first-token confidence \(probability) is below threshold \(threshold)."
    }
}
}
9 changes: 9 additions & 0 deletions tabby/Models/SuggestionEngineModels.swift
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,13 @@ struct SuggestionSettingsSnapshot: Equatable, Sendable {
/// on the first generated token, preventing conversational openers from appearing in
/// inline autocomplete suggestions.
let isFirstTokenGatingEnabled: Bool
/// When true, the llama runtime measures top-1 probability of the raw-logit softmax at
/// position 0 and silently suppresses the suggestion if it falls below
/// `firstTokenConfidenceThreshold`. Distinct from gating: gating *masks* specific tokens,
/// confidence suppression *aborts* the whole suggestion when the model is uncertain.
let isFirstTokenConfidenceGatingEnabled: Bool
/// Probability threshold in [0, 1]. The suggestion is suppressed when the model's top-1
/// raw-logit softmax probability at position 0 is below this value. 0 disables in practice
/// (any probability >= 0 passes).
let firstTokenConfidenceThreshold: Double
}
6 changes: 6 additions & 0 deletions tabby/Models/SuggestionModels.swift
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,12 @@ struct SuggestionRequest: Equatable, Sendable {
/// on the very first sampled token. This is a llama-only feature — Apple Intelligence does
/// not expose logit-level control.
let isFirstTokenGatingEnabled: Bool
/// When true, the llama runtime measures the top-1 raw-logit softmax probability at the
/// first token position and aborts (returns no suggestion) when it falls below
/// `firstTokenConfidenceThreshold`. llama-only.
let isFirstTokenConfidenceGatingEnabled: Bool
/// Probability threshold in [0, 1] used by `isFirstTokenConfidenceGatingEnabled`.
let firstTokenConfidenceThreshold: Double
}

/// The engine's normalized response, including raw model text for debugging.
Expand Down
110 changes: 91 additions & 19 deletions tabby/Models/SuggestionSettingsModel.swift
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ final class SuggestionSettingsModel: ObservableObject {
/// This prevents instruction-tuned models from starting suggestions with conversational
/// openers that belong in a chat reply, not in inline autocomplete.
@Published private(set) var isFirstTokenGatingEnabled: Bool
/// When enabled, the llama runtime measures the top-1 raw-logit softmax probability of the
/// first sampled token and silently suppresses the whole suggestion if it falls below
/// `firstTokenConfidenceThreshold`. This is a *separate* axis from chat-opener gating:
/// gating masks specific tokens; confidence suppression aborts generation entirely when
/// the model's own distribution is too flat to produce a trustworthy continuation.
@Published private(set) var isFirstTokenConfidenceGatingEnabled: Bool
/// Probability threshold in [0, 1]. Higher values are stricter (more suggestions are
/// suppressed). 0 effectively disables the gate even when the toggle is on.
@Published private(set) var firstTokenConfidenceThreshold: Double

private let userDefaults: UserDefaults

Expand All @@ -38,6 +47,14 @@ final class SuggestionSettingsModel: ObservableObject {
private static let selectedLocalPromptModeDefaultsKey = "selectedLocalSuggestionPromptMode"
private static let customAIInstructionsDefaultsKey = "tabbyCustomAIInstructions"
private static let isFirstTokenGatingEnabledDefaultsKey = "tabbyFirstTokenGatingEnabled"
private static let confidenceGatingEnabledDefaultsKey = "tabbyFirstTokenConfidenceGatingEnabled"
private static let confidenceThresholdDefaultsKey = "tabbyFirstTokenConfidenceThreshold"

/// 0.10 is a deliberately gentle starting point: our local models often peak at ~0.30-0.60
/// for unambiguous continuations, so this threshold catches only the genuinely-confused
/// cases (e.g. the model sees the prompt as ambiguous and spreads probability widely).
/// We expect to tune this once telemetry from the `first-token-confidence` log accumulates.
private static let defaultFirstTokenConfidenceThreshold: Double = 0.10

init(
configuration: SuggestionConfiguration,
Expand Down Expand Up @@ -74,6 +91,18 @@ final class SuggestionSettingsModel: ObservableObject {
}
// Default to enabled — first-token gating is a net positive for all known instruct models.
let resolvedFirstTokenGatingEnabled = userDefaults.object(forKey: Self.isFirstTokenGatingEnabledDefaultsKey) as? Bool ?? true
// Default off until we've seen field telemetry. The deny-list gate ships on by default
// because it's evidence-backed and surgical; confidence suppression is heuristic and can
// hide useful suggestions when the threshold is mistuned, so users opt in explicitly.
let resolvedConfidenceGatingEnabled = userDefaults
.object(forKey: Self.confidenceGatingEnabledDefaultsKey) as? Bool ?? false
let resolvedConfidenceThreshold: Double = {
guard userDefaults.object(forKey: Self.confidenceThresholdDefaultsKey) != nil else {
return Self.defaultFirstTokenConfidenceThreshold
}
let raw = userDefaults.double(forKey: Self.confidenceThresholdDefaultsKey)
return min(max(raw, 0.0), 1.0)
}()

isGloballyEnabled = resolvedGloballyEnabled
disabledAppRules = resolvedDisabledAppRules
Expand All @@ -84,6 +113,8 @@ final class SuggestionSettingsModel: ObservableObject {
selectedLocalPromptMode = resolvedLocalPromptMode
customAIInstructions = resolvedCustomAIInstructions
isFirstTokenGatingEnabled = resolvedFirstTokenGatingEnabled
isFirstTokenConfidenceGatingEnabled = resolvedConfidenceGatingEnabled
firstTokenConfidenceThreshold = resolvedConfidenceThreshold

userDefaults.set(resolvedGloballyEnabled, forKey: Self.isGloballyEnabledDefaultsKey)
persistDisabledAppRules(resolvedDisabledAppRules)
Expand All @@ -94,6 +125,8 @@ final class SuggestionSettingsModel: ObservableObject {
persistSelectedLocalPromptMode(resolvedLocalPromptMode)
persistCustomAIInstructions(resolvedCustomAIInstructions)
userDefaults.set(resolvedFirstTokenGatingEnabled, forKey: Self.isFirstTokenGatingEnabledDefaultsKey)
userDefaults.set(resolvedConfidenceGatingEnabled, forKey: Self.confidenceGatingEnabledDefaultsKey)
userDefaults.set(resolvedConfidenceThreshold, forKey: Self.confidenceThresholdDefaultsKey)
}

/// Compatibility shim for legacy call sites while the UI migrates from the old toggle to the
Expand Down Expand Up @@ -121,7 +154,9 @@ final class SuggestionSettingsModel: ObservableObject {
selectedWordCountPreset: selectedWordCountPreset,
effectivePromptMode: effectivePromptMode,
customAIInstructions: CustomAIInstructionFormatter.normalized(customAIInstructions),
isFirstTokenGatingEnabled: isFirstTokenGatingEnabled
isFirstTokenGatingEnabled: isFirstTokenGatingEnabled,
isFirstTokenConfidenceGatingEnabled: isFirstTokenConfidenceGatingEnabled,
firstTokenConfidenceThreshold: firstTokenConfidenceThreshold
)
}

Expand Down Expand Up @@ -282,6 +317,27 @@ final class SuggestionSettingsModel: ObservableObject {
userDefaults.set(enabled, forKey: Self.isFirstTokenGatingEnabledDefaultsKey)
}

/// Turns first-token confidence gating on or off and writes the choice to user defaults.
///
/// - Parameter enabled: the desired state. Passing the current state is a no-op, so nothing
///   is republished or rewritten.
func setFirstTokenConfidenceGatingEnabled(_ enabled: Bool) {
    let isUnchanged = (isFirstTokenConfidenceGatingEnabled == enabled)
    if isUnchanged {
        return
    }

    // Publish first, then persist.
    isFirstTokenConfidenceGatingEnabled = enabled
    userDefaults.set(enabled, forKey: Self.confidenceGatingEnabledDefaultsKey)
}

/// Updates the first-token confidence threshold and writes it to user defaults.
///
/// - Parameter threshold: the requested probability threshold. Values outside `[0, 1]` are
///   clamped into range. NaN is ignored, leaving the current threshold untouched.
func setFirstTokenConfidenceThreshold(_ threshold: Double) {
    // NaN passes through the `min`/`max` clamp below unchanged and compares unequal to every
    // value, so without this guard it would be published and persisted.
    guard !threshold.isNaN else {
        return
    }

    // Clamp at the setter boundary so an out-of-range input (e.g. a slider bug) cannot
    // corrupt persisted state.
    let clamped = min(max(threshold, 0.0), 1.0)
    guard firstTokenConfidenceThreshold != clamped else {
        return
    }

    firstTokenConfidenceThreshold = clamped
    userDefaults.set(clamped, forKey: Self.confidenceThresholdDefaultsKey)
}

private static func effectivePromptMode(
engine: SuggestionEngineKind,
localPromptMode: SuggestionPromptMode
Expand Down Expand Up @@ -423,7 +479,12 @@ final class SuggestionSettingsModel: ObservableObject {

extension SuggestionSettingsModel: SuggestionSettingsProviding {
var snapshotPublisher: AnyPublisher<SuggestionSettingsSnapshot, Never> {
Publishers.CombineLatest4(
// Combine maxes out at four upstreams per operator, but the snapshot now depends on nine
// published values. We split them into two bundles — a "core" group (which still carries
// the first-token gating toggle) and a "confidence" group holding the two llama-only
// confidence-gating settings — then CombineLatest those two intermediate publishers.
// removeDuplicates on the mapped snapshot keeps downstream emissions limited to real changes.
let coreSelections = Publishers.CombineLatest4(
Publishers.CombineLatest4(
$isGloballyEnabled,
$disabledAppRules,
Expand All @@ -434,22 +495,33 @@ extension SuggestionSettingsModel: SuggestionSettingsProviding {
$customAIInstructions,
$isFirstTokenGatingEnabled
)
.map { combinedSettings, localPromptMode, customAIInstructions, firstTokenGatingEnabled in
let (globallyEnabled, disabledAppRules, engine, wordCountPreset) = combinedSettings
return SuggestionSettingsSnapshot(
isGloballyEnabled: globallyEnabled,
disabledAppBundleIdentifiers: Set(disabledAppRules.map(\.bundleIdentifier)),
selectedEngine: engine,
selectedWordCountPreset: wordCountPreset,
effectivePromptMode: Self.effectivePromptMode(
engine: engine,
localPromptMode: localPromptMode
),
customAIInstructions: CustomAIInstructionFormatter.normalized(customAIInstructions),
isFirstTokenGatingEnabled: firstTokenGatingEnabled
)
}
.removeDuplicates()
.eraseToAnyPublisher()

let confidenceSelections = Publishers.CombineLatest(
$isFirstTokenConfidenceGatingEnabled,
$firstTokenConfidenceThreshold
)

return Publishers.CombineLatest(coreSelections, confidenceSelections)
.map { coreTuple, confidenceTuple in
let (combinedSettings, localPromptMode, customAIInstructions, firstTokenGatingEnabled) = coreTuple
let (globallyEnabled, disabledAppRules, engine, wordCountPreset) = combinedSettings
let (confidenceGatingEnabled, confidenceThreshold) = confidenceTuple
return SuggestionSettingsSnapshot(
isGloballyEnabled: globallyEnabled,
disabledAppBundleIdentifiers: Set(disabledAppRules.map(\.bundleIdentifier)),
selectedEngine: engine,
selectedWordCountPreset: wordCountPreset,
effectivePromptMode: Self.effectivePromptMode(
engine: engine,
localPromptMode: localPromptMode
),
customAIInstructions: CustomAIInstructionFormatter.normalized(customAIInstructions),
isFirstTokenGatingEnabled: firstTokenGatingEnabled,
isFirstTokenConfidenceGatingEnabled: confidenceGatingEnabled,
firstTokenConfidenceThreshold: confidenceThreshold
)
}
.removeDuplicates()
.eraseToAnyPublisher()
}
}
104 changes: 103 additions & 1 deletion tabby/Services/Runtime/LlamaRuntimeCore.swift
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ actor LlamaRuntimeCore {
category: "first-token-gate"
)

/// Logger for first-token confidence-based suppression events.
///
/// Uses its own category, distinct from the "first-token-gate" logger, because these are
/// *separate signals*: gating masks specific tokens; confidence suppression aborts the whole
/// suggestion when the model's distribution at position 0 is too flat. A single generation
/// can fire neither, one, or both.
///
/// To stream only these events:
///
///     log stream --predicate 'subsystem == "app.tabby" AND category == "first-token-confidence"'
private static let firstTokenConfidenceLogger = Logger(
subsystem: "app.tabby",
category: "first-token-confidence"
)

private var backendInitialized = false
private var model: OpaquePointer?
private var preparedRuntime: PreparedLlamaRuntime?
Expand Down Expand Up @@ -162,7 +172,9 @@ actor LlamaRuntimeCore {
minP: Double,
repetitionPenalty: Double,
seed: UInt32? = nil,
firstTokenGatingEnabled: Bool = true
firstTokenGatingEnabled: Bool = true,
firstTokenConfidenceGatingEnabled: Bool = false,
firstTokenConfidenceThreshold: Double = 0.0
) throws -> String {
guard let preparedRuntime else {
throw LlamaRuntimeError.unavailable("The llama model is not loaded.")
Expand Down Expand Up @@ -257,6 +269,21 @@ actor LlamaRuntimeCore {
logFirstTokenGateFireIfNeeded(context: context, vocab: vocab)
}

// Confidence gating runs *before* sampling so we can abort the whole generation
// (and avoid burning a sampled token + decode) when the model's distribution at
// position 0 is too flat. The signal is the top-1 probability of the softmax over
// the raw logits — not the post-sampler distribution — because temperature/top-p
// shape the sampler's output, not the model's actual confidence.
if tokenIndex == 0 && firstTokenConfidenceGatingEnabled {
if let suppression = lowConfidenceSuppressionIfNeeded(
context: context,
vocab: vocab,
threshold: firstTokenConfidenceThreshold
) {
throw suppression
}
}

let nextToken = llama_sampler_sample(activeSampler, context, -1)
if nextToken == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, nextToken) {
break
Expand All @@ -282,6 +309,14 @@ actor LlamaRuntimeCore {
try decodeToken(nextToken, position: position, in: context)
position += 1
}
} catch let error as LlamaRuntimeError {
// Confidence suppression is a clean abort — the prompt KV is still valid and the next
// request can reuse it. Reserve the cache reset for genuine generation failures.
if case .lowConfidenceSuppression = error {
throw error
}
shouldResetPromptCache = true
throw error
} catch {
shouldResetPromptCache = true
throw error
Expand Down Expand Up @@ -805,6 +840,73 @@ actor LlamaRuntimeCore {
)
}

/// Checks the model's confidence at the last context position and returns a suppression error
/// if it is too low. Confidence is defined as the **top-1 probability of the softmax over the
/// raw logits** — i.e. how peaked the model's actual distribution is, before any sampler-chain
/// transforms.
///
/// We deliberately don't use the *sampled* token's post-transform probability: temperature,
/// top-p, and min-p reshape the distribution, so a sampled-token probability can look high
/// even when the raw distribution was flat. For inline autocomplete we want to suppress when
/// *the model itself* was uncertain.
///
/// Implementation note: the softmax is computed in a numerically-stable way (subtract the max
/// logit before exponentiating) using two passes over the vocabulary — one to find the argmax,
/// one to accumulate the normalizer.
///
/// - Parameters:
///   - context: llama context whose most recent logits are inspected.
///   - vocab: vocabulary used for the token count and for rendering the top-1 token.
///   - threshold: minimum acceptable top-1 probability. A threshold that is not greater than
///     zero can never suppress, so the vocabulary scan is skipped for it.
/// - Returns: `.lowConfidenceSuppression` carrying the measured probability, the threshold and
///   the top-1 token piece when the probability is below `threshold`; `nil` when the gate
///   passes or when no usable logits are available.
private func lowConfidenceSuppressionIfNeeded(
    context: OpaquePointer,
    vocab: OpaquePointer,
    threshold: Double
) -> LlamaRuntimeError? {
    // A probability is never negative, so a non-positive (or NaN) threshold cannot trigger
    // suppression. Return before touching the logits instead of scanning the vocabulary twice
    // for a result that is already known.
    guard threshold > 0 else {
        return nil
    }

    guard let logits = llama_get_logits_ith(context, -1) else {
        return nil
    }

    let nVocab = Int(llama_vocab_n_tokens(vocab))
    guard nVocab > 0 else { return nil }

    // Pass 1: locate the largest logit and remember which token it belongs to.
    var maxLogit: Float = -.infinity
    var argmaxTokenID: llama_token = 0
    for tokenID in 0 ..< nVocab {
        let value = logits[tokenID]
        if value > maxLogit {
            maxLogit = value
            argmaxTokenID = llama_token(tokenID)
        }
    }

    // Pass 2: numerically-stable softmax normalizer. Subtracting the max before exponentiating
    // avoids overflow on large logits. The probability of the argmax token is then
    //   exp(0) / sum(exp(logit_i - max)) = 1 / sum(exp(logit_i - max))
    var expSum: Double = 0
    for tokenID in 0 ..< nVocab {
        expSum += Double(exp(logits[tokenID] - maxLogit))
    }
    // Also rejects a NaN sum (e.g. non-finite logits), because NaN > 0 is false.
    guard expSum > 0 else { return nil }

    // Decide in Double precision. Narrowing to Float first can round a probability that is
    // just under the threshold up to or past it, flipping the gate's decision at the boundary.
    let exactTopProbability = 1.0 / expSum
    guard exactTopProbability < threshold else {
        return nil
    }

    // The error case and the log line carry the probability as Float.
    let topProbability = Float(exactTopProbability)

    let piece = pieceString(for: argmaxTokenID, vocab: vocab)
    // Escape newlines and tabs so the log entry stays on a single readable line.
    let escaped = piece
        .replacingOccurrences(of: "\n", with: "\\n")
        .replacingOccurrences(of: "\t", with: "\\t")
    Self.firstTokenConfidenceLogger.debug(
        "suppressed: top-1 token \(argmaxTokenID, privacy: .public) (\"\(escaped, privacy: .public)\") prob=\(topProbability, privacy: .public) threshold=\(threshold, privacy: .public)"
    )

    return .lowConfidenceSuppression(
        probability: topProbability,
        threshold: threshold,
        token: piece
    )
}

/// Tokenizes a short string and returns just the first token, without BOS.
/// Returns nil if tokenization fails or produces no tokens.
private func tokenizeFirstToken(_ text: String, vocab: OpaquePointer) -> llama_token? {
Expand Down
Loading