From f865fdcdc97e5bf4b05d757a23a60cd3f27df1ca Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Mon, 1 Jun 2026 20:46:18 -0700 Subject: [PATCH 1/2] Add token-aware prompt budgeting as an opt-in path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The base-model prompt is budgeted in characters as a deliberate ~4-chars-per-token approximation. That ratio is far off for code and non-Latin text, where it can under- or over-fill the real context window. This adds a token-aware path that swaps in an estimated token count, exactly as PromptSectionBudget's own comment anticipated, without paying for the runtime tokenizer on the main-actor prompt path. - TokenCountEstimator is a pure, cheap, word-aware heuristic (roughly four characters per token within a word, every word at least one token) — closer to real subword tokenization than a single global ratio, deterministic for tests. - PromptSectionBudget gains an additive allocate(_:totalTokens:estimate:) that fills by priority against an estimated-token budget, converting each section's token cap to a character cap via that content's own density so the existing character-based truncate is reused unchanged. The character allocate is untouched. - BaseCompletionPromptRenderer takes an optional tokenBudget; nil keeps the character path, so shipped behavior is unchanged. The estimator, the token allocator, and the renderer's token path are all unit-tested (the caret prefix stays un-starved under a tight token budget). Wiring a caller to pass a real token budget is the follow-up: the right budget value and the quality delta need on-device validation, so it stays opt-in until then. 
--- Cotabby.xcodeproj/project.pbxproj | 8 ++++ .../BaseCompletionPromptRenderer.swift | 17 +++++++- Cotabby/Support/PromptSectionBudget.swift | 43 +++++++++++++++++++ Cotabby/Support/TokenCountEstimator.swift | 23 ++++++++++ .../BaseCompletionPromptRendererTests.swift | 16 +++++++ CotabbyTests/PromptSectionBudgetTests.swift | 35 +++++++++++++++ CotabbyTests/TokenCountEstimatorTests.swift | 36 ++++++++++++++++ 7 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 Cotabby/Support/TokenCountEstimator.swift create mode 100644 CotabbyTests/TokenCountEstimatorTests.swift diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj index ab32d71..e83bc29 100644 --- a/Cotabby.xcodeproj/project.pbxproj +++ b/Cotabby.xcodeproj/project.pbxproj @@ -184,6 +184,7 @@ A0657CE0488F69F0BD559CBC /* SuggestionCoordinator+Acceptance.swift in Sources */ = {isa = PBXBuildFile; fileRef = 72B13136DF7318F3E96DF0D3 /* SuggestionCoordinator+Acceptance.swift */; }; A0BB87E3665EF6C209034798 /* GhostSuggestionLayoutTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5AD3F4F9FBE82007E4E15F58 /* GhostSuggestionLayoutTests.swift */; }; A147C5EC3F2214A670F7556E /* FocusPollBackoffTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 273B4DC844F79B4BE2C8910F /* FocusPollBackoffTests.swift */; }; + A26E14A6E73036222419C424 /* TokenCountEstimatorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = B78AA11B52A6588119ABF76F /* TokenCountEstimatorTests.swift */; }; A2B3F4D38BCB0FED452B2A3F /* FocusTrackingModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = B6D42CD456B4B3C988B148A6 /* FocusTrackingModel.swift */; }; A36481222BB5B2A67349D389 /* ApplicationBundleMetadataTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = A168A7B6A7AD11559B60C56B /* ApplicationBundleMetadataTests.swift */; }; A5A6CE0EF01CA6A9AFA7A400 /* RequestID.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6DC693E00430F46E41CB56E6 /* RequestID.swift */; }; @@ 
-234,6 +235,7 @@ DA23422A2CF77CFD3B1283C8 /* OnboardingTemplateFeatureListTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = D814BBA41CF29E8DD9954651 /* OnboardingTemplateFeatureListTests.swift */; }; DA2A22F5386CC25420E98E6C /* FillInMiddlePolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 276FA037D0F8DF51AABF4292 /* FillInMiddlePolicy.swift */; }; DB1310FF3576ACA6472C4DB1 /* TrailingDuplicationFilterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E19A5B462891263BDFB56607 /* TrailingDuplicationFilterTests.swift */; }; + DC84D6A6A2F9A1060CD20ABB /* TokenCountEstimator.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1BA30E71C21C77BB6EA4C166 /* TokenCountEstimator.swift */; }; DCABB8D2B391C7820D6CA5FF /* InsertionSafetyGate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7D472F9F396672E57873303B /* InsertionSafetyGate.swift */; }; DD7FA343F1C21C4569F6D181 /* ScreenshotContextGenerator.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9B84BAE361626891F19DC9DB /* ScreenshotContextGenerator.swift */; }; DDEDCBAA2196303455F6926A /* AcceptanceModePickerView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E5DAF68AEBFE334F68A65E82 /* AcceptanceModePickerView.swift */; }; @@ -314,6 +316,7 @@ 19BE12C28A4AB8A4A58C2FF7 /* SettingsPaneScaffold.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsPaneScaffold.swift; sourceTree = "<group>"; }; 19DB9558F4D3AFB108D71649 /* SuggestionStateHelperTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionStateHelperTests.swift; sourceTree = "<group>"; }; 1A8414BEB7E34F57607E37FE /* EmojiVariantResolver.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EmojiVariantResolver.swift; sourceTree = "<group>"; }; + 1BA30E71C21C77BB6EA4C166 /* TokenCountEstimator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TokenCountEstimator.swift; sourceTree = "<group>"; }; 1BD71ECC2AE4821B643E0935 /* 
ConfidenceSuppressionPolicy.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConfidenceSuppressionPolicy.swift; sourceTree = "<group>"; }; 1CE61E74928C221B8BB261C6 /* SuggestionTextColorCodec.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionTextColorCodec.swift; sourceTree = "<group>"; }; 1D00A031C0D9CF2A7A2330D9 /* PermissionDragSourceView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PermissionDragSourceView.swift; sourceTree = "<group>"; }; @@ -465,6 +468,7 @@ B4B4A2E2DD6733658EC05BD8 /* DownloadFileRescuer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DownloadFileRescuer.swift; sourceTree = "<group>"; }; B6ACCB12E4DB32D2F2BEA567 /* PermissionHostApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PermissionHostApp.swift; sourceTree = "<group>"; }; B6D42CD456B4B3C988B148A6 /* FocusTrackingModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusTrackingModel.swift; sourceTree = "<group>"; }; + B78AA11B52A6588119ABF76F /* TokenCountEstimatorTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TokenCountEstimatorTests.swift; sourceTree = "<group>"; }; B7B185BA246A526CBA85E581 /* EmojiPickerPanelLayoutTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EmojiPickerPanelLayoutTests.swift; sourceTree = "<group>"; }; B81DD30EB657368AACE9625A /* InputMonitor.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputMonitor.swift; sourceTree = "<group>"; }; B997EC69E1C65B1E18234221 /* BrowserAppDetector.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BrowserAppDetector.swift; sourceTree = "<group>"; }; @@ -839,6 +843,7 @@ C71031E8DB171047318B92FC /* SyntheticReplacePlannerTests.swift */, 43E37A7E835D3BDE6265843C /* TerminalAppDetectorTests.swift */, FC24FD54860CE6737E65EF65 /* 
TextDirectionDetectorTests.swift */, + B78AA11B52A6588119ABF76F /* TokenCountEstimatorTests.swift */, F394B8A6E30CC47015772089 /* TokenProfileCacheTests.swift */, E7D0BF193110927BEB865748 /* TokenProfileTests.swift */, E19A5B462891263BDFB56607 /* TrailingDuplicationFilterTests.swift */, @@ -998,6 +1003,7 @@ B424E2AC97C99D335B0D5751 /* SuggestionTextNormalizer.swift */, 7F4C4A7EAF886E0CC945BFEF /* TerminalAppDetector.swift */, 328847A0F494360033366791 /* TextDirectionDetector.swift */, + 1BA30E71C21C77BB6EA4C166 /* TokenCountEstimator.swift */, F3CEFE8C321E17BB3873C893 /* TokenProfile.swift */, E73C04A71D85B25998144F11 /* TokenProfileCache.swift */, D408D647412C59F3E692C42B /* TrailingDuplicationFilter.swift */, @@ -1289,6 +1295,7 @@ AB9C9C001F97F9D14F8B192A /* TerminalAppDetector.swift in Sources */, 96782E57CA26A16409368B69 /* TextDirectionDetector.swift in Sources */, 6014B31E2570EFFE45557E33 /* TickMarkSlider.swift in Sources */, + DC84D6A6A2F9A1060CD20ABB /* TokenCountEstimator.swift in Sources */, 8EED2B55999A119AE3B67359 /* TokenProfile.swift in Sources */, D747A2C2B49450D26C6179A7 /* TokenProfileCache.swift in Sources */, D3B43622E5A41B11E7AF527E /* TrailingDuplicationFilter.swift in Sources */, @@ -1384,6 +1391,7 @@ EF5BAB96DDADABB86F9E02D9 /* SyntheticReplacePlannerTests.swift in Sources */, DE236C9285635C686D66A2F6 /* TerminalAppDetectorTests.swift in Sources */, 5A441797D71A880A7482077D /* TextDirectionDetectorTests.swift in Sources */, + A26E14A6E73036222419C424 /* TokenCountEstimatorTests.swift in Sources */, D9B992A608F7FC9924D13271 /* TokenProfileCacheTests.swift in Sources */, CA8F453AA4AD02FAA8C961F7 /* TokenProfileTests.swift in Sources */, DB1310FF3576ACA6472C4DB1 /* TrailingDuplicationFilterTests.swift in Sources */, diff --git a/Cotabby/Support/BaseCompletionPromptRenderer.swift b/Cotabby/Support/BaseCompletionPromptRenderer.swift index d7c6cbf..f92ac62 100644 --- a/Cotabby/Support/BaseCompletionPromptRenderer.swift +++ 
b/Cotabby/Support/BaseCompletionPromptRenderer.swift @@ -27,7 +27,8 @@ enum BaseCompletionPromptRenderer { languageInstruction: String? = nil, clipboardContext: String? = nil, visualContextSummary: String? = nil, - contextBudget: Int = defaultContextBudget + contextBudget: Int = defaultContextBudget, + tokenBudget: Int? = nil ) -> String { let trimmedPrefix = Self.trimmingTrailingWhitespace(prefixText) @@ -65,7 +66,19 @@ enum BaseCompletionPromptRenderer { ) ) - let kept = PromptSectionBudget.allocate(sections, totalChars: contextBudget) + // Token-aware budgeting (opt-in): when a token budget is supplied, fill sections against an + // estimated-token window instead of the character approximation. Defaults to the character + // path so shipped behavior is unchanged. + let kept: [PromptSection] + if let tokenBudget { + kept = PromptSectionBudget.allocate( + sections, + totalTokens: tokenBudget, + estimate: TokenCountEstimator.estimate + ) + } else { + kept = PromptSectionBudget.allocate(sections, totalChars: contextBudget) + } let prefix = kept.first { $0.name == "prefix" }?.content ?? trimmedPrefix let preface = kept.filter { $0.name != "prefix" }.map(\.content) diff --git a/Cotabby/Support/PromptSectionBudget.swift b/Cotabby/Support/PromptSectionBudget.swift index 0bea6e0..a3df1a3 100644 --- a/Cotabby/Support/PromptSectionBudget.swift +++ b/Cotabby/Support/PromptSectionBudget.swift @@ -69,6 +69,49 @@ enum PromptSectionBudget { return sections.indices.compactMap { kept[$0] } } + /// Token-aware variant of `allocate`: the budget and remaining are counted in *estimated tokens* + /// (via `estimate`) instead of characters, so a base model's real context window is respected more + /// faithfully than a flat chars-per-token ratio — which matters most for code or non-Latin text, + /// where that ratio is far from four. 
Each section's intrinsic `minChars`/`maxChars` still bound + /// the content itself; the per-section token cap is converted to a character cap using that + /// content's own character-per-token density, so the character-based `truncate` is reused as is. + static func allocate( + _ sections: [PromptSection], + totalTokens: Int, + estimate: (String) -> Int + ) -> [PromptSection] { + var remainingTokens = max(0, totalTokens) + + let fillOrder = sections.enumerated().sorted { lhs, rhs in + lhs.element.priority == rhs.element.priority + ? lhs.offset < rhs.offset + : lhs.element.priority > rhs.element.priority + } + + var kept: [Int: PromptSection] = [:] + for (index, section) in fillOrder { + guard remainingTokens > 0 else { break } + let contentTokens = max(1, estimate(section.content)) + let charsPerToken = Double(section.content.count) / Double(contentTokens) + let remainingChars = Int((Double(remainingTokens) * charsPerToken).rounded(.down)) + let cap = min(section.maxChars, section.content.count, remainingChars) + if cap < section.minChars { + continue + } + let truncated = truncate(section.content, toChars: cap, mode: section.truncation) + .trimmingCharacters(in: .whitespacesAndNewlines) + guard !truncated.isEmpty else { + continue + } + var copy = section + copy.content = truncated + kept[index] = copy + remainingTokens -= estimate(truncated) + } + + return sections.indices.compactMap { kept[$0] } + } + /// Truncates `text` to at most `chars`, keeping the start or the end per `mode`. Returns the /// input unchanged when it already fits, and the empty string when `chars <= 0`. 
static func truncate(_ text: String, toChars chars: Int, mode: PromptSection.Truncation) -> String { diff --git a/Cotabby/Support/TokenCountEstimator.swift b/Cotabby/Support/TokenCountEstimator.swift new file mode 100644 index 0000000..559d549 --- /dev/null +++ b/Cotabby/Support/TokenCountEstimator.swift @@ -0,0 +1,23 @@ +import Foundation + +/// File overview: +/// A pure, cheap estimate of how many model tokens a string occupies, used to budget the base-model +/// prompt more faithfully than a flat character count without paying for a real tokenizer on the +/// main-actor prompt path. +/// +/// It is intentionally an approximation: a word-aware heuristic (roughly four characters per token +/// within a word, every word at least one token) is closer to real subword tokenization than a single +/// global chars-per-token ratio — especially for code or short function words — while staying +/// allocation-light and deterministic for tests. It is not exact, so it is used only for relative +/// budgeting decisions, never to assert a hard token limit. 
+enum TokenCountEstimator { + static func estimate(_ text: String) -> Int { + let words = text.split(whereSeparator: { $0.isWhitespace }) + guard !words.isEmpty else { + return 0 + } + return words.reduce(0) { total, word in + total + max(1, Int((Double(word.count) / 4.0).rounded())) + } + } +} diff --git a/CotabbyTests/BaseCompletionPromptRendererTests.swift b/CotabbyTests/BaseCompletionPromptRendererTests.swift index 89a6a44..891043c 100644 --- a/CotabbyTests/BaseCompletionPromptRendererTests.swift +++ b/CotabbyTests/BaseCompletionPromptRendererTests.swift @@ -42,6 +42,22 @@ final class BaseCompletionPromptRendererTests: XCTestCase { XCTAssertTrue(prompt.hasSuffix("the meeting is at")) } + func test_tokenBudget_keepsCaretPrefixUnderATightBudget() { + // The opt-in token-budgeted path must keep the caret prefix (top priority) at the very end, + // exactly like the character path, while a tight budget trims lower-priority context. + let prompt = BaseCompletionPromptRenderer.prompt( + prefixText: "the meeting is at", + applicationName: "Slack", + userName: "Jacob", + customRules: ["terse"], + extendedContext: "Project Matcha ships in June with a great many additional notes kept here.", + clipboardContext: "zoom link", + visualContextSummary: "Calendar: Q3 planning 3pm", + tokenBudget: 8 + ) + XCTAssertTrue(prompt.hasSuffix("the meeting is at"), "the caret prefix is never starved under a token budget") + } + func test_personaFramingConditionsOnNameStyleAndLanguage() { let prompt = BaseCompletionPromptRenderer.prompt( prefixText: "Hi team,", diff --git a/CotabbyTests/PromptSectionBudgetTests.swift b/CotabbyTests/PromptSectionBudgetTests.swift index 899ac3d..916226e 100644 --- a/CotabbyTests/PromptSectionBudgetTests.swift +++ b/CotabbyTests/PromptSectionBudgetTests.swift @@ -78,4 +78,39 @@ final class PromptSectionBudgetTests: XCTestCase { func test_truncate_returnsInputWhenItFits() { XCTAssertEqual(PromptSectionBudget.truncate("abc", toChars: 10, mode: .preserveEnd), 
"abc") } + + // MARK: - Token-aware allocate + + func test_tokenAllocate_keepsAllWhenBudgetAmple() { + let kept = PromptSectionBudget.allocate( + [section("a", "alpha", priority: 10), section("b", "beta", priority: 5)], + totalTokens: 1000, + estimate: TokenCountEstimator.estimate + ) + XCTAssertEqual(kept.map(\.name), ["a", "b"]) + } + + func test_tokenAllocate_dropsLowerPriorityWhenBudgetTight() { + let low = String(repeating: "word ", count: 5) + let high = String(repeating: "term ", count: 5) + let kept = PromptSectionBudget.allocate( + [section("low", low, priority: 1), section("high", high, priority: 9)], + totalTokens: 5, + estimate: TokenCountEstimator.estimate + ) + XCTAssertEqual(kept.map(\.name), ["high"]) + } + + func test_tokenAllocate_respectsTokenBudget() { + let kept = PromptSectionBudget.allocate( + [ + section("a", String(repeating: "alpha ", count: 20), priority: 9), + section("b", String(repeating: "bravo ", count: 20), priority: 8) + ], + totalTokens: 25, + estimate: TokenCountEstimator.estimate + ) + let used = kept.reduce(0) { $0 + TokenCountEstimator.estimate($1.content) } + XCTAssertLessThanOrEqual(used, 25) + } } diff --git a/CotabbyTests/TokenCountEstimatorTests.swift b/CotabbyTests/TokenCountEstimatorTests.swift new file mode 100644 index 0000000..cd12672 --- /dev/null +++ b/CotabbyTests/TokenCountEstimatorTests.swift @@ -0,0 +1,36 @@ +import XCTest +@testable import Cotabby + +/// Tests for the heuristic token-count estimator. It is deliberately approximate, so these lock down +/// robust *relationships* (empty is zero, longer text estimates more, every word counts) rather than +/// exact token counts a real tokenizer would produce. 
+final class TokenCountEstimatorTests: XCTestCase { + func test_emptyOrWhitespaceIsZero() { + XCTAssertEqual(TokenCountEstimator.estimate(""), 0) + XCTAssertEqual(TokenCountEstimator.estimate(" \n\t "), 0) + } + + func test_everyWordIsAtLeastOneToken() { + XCTAssertEqual(TokenCountEstimator.estimate("a"), 1) + XCTAssertGreaterThanOrEqual(TokenCountEstimator.estimate("hi there"), 2) + } + + func test_longerTextEstimatesMoreTokens() { + let short = TokenCountEstimator.estimate("the cat sat") + let long = TokenCountEstimator.estimate("the cat sat on the warm windowsill all afternoon long") + XCTAssertGreaterThan(long, short) + } + + func test_longWordCountsForMoreThanShortWord() { + XCTAssertGreaterThan( + TokenCountEstimator.estimate("internationalization"), + TokenCountEstimator.estimate("cat") + ) + } + + func test_scalesWithWordCount() { + let oneWord = TokenCountEstimator.estimate("word") + let fiveWords = TokenCountEstimator.estimate("word word word word word") + XCTAssertEqual(fiveWords, oneWord * 5) + } +} From a96eb89b87d5b9c08ddeac33f084f4c379b1fd85 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Mon, 1 Jun 2026 21:42:18 -0700 Subject: [PATCH 2/2] Address review feedback on token budgeting - PromptSectionBudget: clamp remainingTokens at zero. A truncated slice can be token-denser than the section average, so deducting its estimate could drive the remaining budget negative and wrongly drop the next section even when it fits. - TokenCountEstimator: split on punctuation as well as whitespace, so contractions ("can't") and punctuation-joined identifiers ("foo.bar") aren't undercounted as a single word. 
--- Cotabby/Support/PromptSectionBudget.swift | 5 ++++- Cotabby/Support/TokenCountEstimator.swift | 5 ++++- CotabbyTests/TokenCountEstimatorTests.swift | 7 +++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Cotabby/Support/PromptSectionBudget.swift b/Cotabby/Support/PromptSectionBudget.swift index a3df1a3..be6e111 100644 --- a/Cotabby/Support/PromptSectionBudget.swift +++ b/Cotabby/Support/PromptSectionBudget.swift @@ -106,7 +106,10 @@ enum PromptSectionBudget { var copy = section copy.content = truncated kept[index] = copy - remainingTokens -= estimate(truncated) + // Clamp: a truncated slice can be token-denser than the section average, so deducting its + // estimate could drive `remainingTokens` negative. Flooring at zero keeps the budget a valid + // non-negative count; the loop guard stops at zero either way, so no further section is kept. + remainingTokens = max(0, remainingTokens - estimate(truncated)) } return sections.indices.compactMap { kept[$0] } diff --git a/Cotabby/Support/TokenCountEstimator.swift b/Cotabby/Support/TokenCountEstimator.swift index 559d549..03191b8 100644 --- a/Cotabby/Support/TokenCountEstimator.swift +++ b/Cotabby/Support/TokenCountEstimator.swift @@ -12,7 +12,10 @@ import Foundation /// budgeting decisions, never to assert a hard token limit. enum TokenCountEstimator { static func estimate(_ text: String) -> Int { - let words = text.split(whereSeparator: { $0.isWhitespace }) + // Split on punctuation as well as whitespace so that "can't" and "foo.bar" count as several + // words instead of one. Separator characters themselves contribute no tokens, so leading or + // trailing punctuation ("end.", "func()") still estimates the same as the bare word. 
+ let words = text.split(whereSeparator: { $0.isWhitespace || $0.isPunctuation }) guard !words.isEmpty else { return 0 } diff --git a/CotabbyTests/TokenCountEstimatorTests.swift b/CotabbyTests/TokenCountEstimatorTests.swift index cd12672..18d9fac 100644 --- a/CotabbyTests/TokenCountEstimatorTests.swift +++ b/CotabbyTests/TokenCountEstimatorTests.swift @@ -33,4 +33,11 @@ final class TokenCountEstimatorTests: XCTestCase { let fiveWords = TokenCountEstimator.estimate("word word word word word") XCTAssertEqual(fiveWords, oneWord * 5) } + + func test_splitsOnPunctuationBoundaries() { + // Punctuation creates token boundaries (like real subword tokenizers), so a contraction or a + // punctuation-joined identifier estimates more tokens than the same letters with none. + XCTAssertGreaterThan(TokenCountEstimator.estimate("can't"), TokenCountEstimator.estimate("cant")) + XCTAssertGreaterThan(TokenCountEstimator.estimate("foo.bar.baz"), TokenCountEstimator.estimate("foobarbaz")) + } }