Skip to content

Commit 5529031

Browse files
feat(categorization): extract findCommonPhrases util with bigram support and tests (#803)
* feat(categorization): extract findCommonPhrases util with bigram support and tests

  Moves the common-phrase detection logic from CategoryBuilder.vue into a standalone src/util/categorization.ts module and adds 10 unit tests.

  Changes vs original PR #455:
  - Extracts function to util module for testability
  - Uses Map<string, WordEntry> (consistent with existing CategoryBuilder code)
  - Filters bigram components against ignored_words and length <= 2
  - Removes debug console.log statements
  - Full TypeScript types

  Closes #455

* fix(categorization): snapshot word durations before bigram promotion loop

  Without this, promoting a bigram (e.g. 'Alpha Beta') reduces constituent word durations in-place. A later bigram that shares the middle word (e.g. 'Beta Gamma') then sees Beta.duration=0, so the check becomes 10/0 = Infinity > 0.5 and the weak bigram is incorrectly promoted.

  Fix: build an originalDurations snapshot before the Step 3 loop and use it for all threshold comparisons; mutations to entry.duration still happen (for accurate display) but no longer corrupt subsequent checks.

  Also adds a regression test that fails on the unfixed code.

* fix(categorization): replace non-null assertions with explicit undefined guards

* style(categorization): fix prettier line-length warning in guard clause
1 parent 0bafab0 commit 5529031

3 files changed

Lines changed: 223 additions & 20 deletions

File tree

src/util/categorization.ts

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import { IEvent } from '~/util/interfaces';
2+
3+
// Regex used to split event titles into words. Every separator character
// splits on its own, so runs of separators (e.g. " - ") yield empty strings;
// those are discarded by the length filter in findCommonPhrases.
const SPLIT_REGEX = /[\s\-,:()[\]/]/;

/** A word (or promoted two-word phrase) together with the time attributed to it. */
export interface WordEntry {
  /** The word itself, or "first second" for a promoted bigram. */
  word: string;
  /** Sum of the durations of the contributing events (after bigram promotion). */
  duration: number;
  /** Events whose title contains the word / bigram. */
  events: IEvent[];
}

// Internal bookkeeping for a bigram candidate; keeps both halves so that
// promotion does not have to re-parse the "first second" key.
interface BigramCandidate {
  first: string;
  second: string;
  duration: number;
  events: IEvent[];
}

/**
 * Finds common words and bigrams (two-word phrases) in event titles,
 * weighted by time duration rather than event count.
 *
 * For each bigram that accounts for >50% of the total duration of both
 * constituent words, the bigram is promoted and the constituent words'
 * durations are reduced accordingly. This means "Mozilla Firefox" appears
 * instead of separate "Mozilla" and "Firefox" entries when they almost
 * always co-occur. Constituent words stay in the result (with their full
 * event list); only their `duration` is lowered, and never below zero.
 *
 * Words with length <= 2 or in `ignored_words` are skipped, both on their own
 * and as bigram components. A word or bigram that occurs several times in one
 * title is credited with that event only once.
 *
 * @param events - Events whose `data.title` is tokenised.
 * @param ignored_words - Exact (case-sensitive) words to leave out.
 * @returns Map keyed by word / promoted bigram, in first-seen order with
 *   promoted bigrams appended after all single words.
 */
export function findCommonPhrases(
  events: IEvent[],
  ignored_words: string[]
): Map<string, WordEntry> {
  const ignored = new Set(ignored_words);
  const isUsable = (token: string): boolean => token.length > 2 && !ignored.has(token);

  const words = new Map<string, WordEntry>();
  const bigrams = new Map<string, BigramCandidate>();

  // Steps 1 & 2: tokenise every title once and accumulate word and bigram
  // durations. The per-event "seen" sets make sure an event contributes its
  // duration at most once per word / bigram, even if the title repeats it.
  for (const event of events) {
    const parts: string[] = event.data.title.split(SPLIT_REGEX);
    const seenWords = new Set<string>();
    const seenBigrams = new Set<string>();

    for (let i = 0; i < parts.length; i++) {
      const current = parts[i];
      if (!isUsable(current)) continue;

      if (!seenWords.has(current)) {
        seenWords.add(current);
        const wordEntry = words.get(current);
        if (wordEntry) {
          wordEntry.duration += event.duration;
          wordEntry.events.push(event);
        } else {
          words.set(current, { word: current, duration: event.duration, events: [event] });
        }
      }

      // A bigram needs a usable right-hand neighbour as well.
      if (i + 1 >= parts.length) continue;
      const next = parts[i + 1];
      if (!isUsable(next)) continue;

      const phrase = `${current} ${next}`;
      if (seenBigrams.has(phrase)) continue;
      seenBigrams.add(phrase);

      const candidate = bigrams.get(phrase);
      if (candidate) {
        candidate.duration += event.duration;
        candidate.events.push(event);
      } else {
        bigrams.set(phrase, {
          first: current,
          second: next,
          duration: event.duration,
          events: [event],
        });
      }
    }
  }

  // Step 3: Promote bigrams that dominate both constituent words (>50% threshold).
  // The threshold is checked against a snapshot of the pre-promotion durations so
  // that promoting one bigram (which lowers its words' durations) cannot distort
  // the check for a later bigram sharing a word. Without the snapshot, a title
  // such as "Alpha Beta Gamma" would let Beta drop to 0 after "Alpha Beta" is
  // promoted and make 10/0 = Infinity pass the check for "Beta Gamma".
  const originalDurations = new Map<string, number>();
  for (const [word, entry] of words) {
    originalDurations.set(word, entry.duration);
  }

  for (const [phrase, candidate] of bigrams) {
    const firstEntry = words.get(candidate.first);
    const secondEntry = words.get(candidate.second);
    const firstTotal = originalDurations.get(candidate.first);
    const secondTotal = originalDurations.get(candidate.second);
    if (!firstEntry || !secondEntry || firstTotal === undefined || secondTotal === undefined) {
      continue;
    }

    const dominatesBoth =
      candidate.duration / firstTotal > 0.5 && candidate.duration / secondTotal > 0.5;
    if (!dominatesBoth) continue;

    words.set(phrase, { word: phrase, duration: candidate.duration, events: candidate.events });

    // A word can be part of several promoted bigrams (the middle word of a
    // recurring three-word title, for instance); clamp so that its remaining
    // duration never becomes negative.
    firstEntry.duration = Math.max(0, firstEntry.duration - candidate.duration);
    secondEntry.duration = Math.max(0, secondEntry.duration - candidate.duration);
  }

  return words;
}

src/views/settings/CategoryBuilder.vue

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ import { canonicalEvents } from '~/queries';
113113
import { getClient } from '~/util/awclient';
114114
import CategoryEditModal from '~/components/CategoryEditModal.vue';
115115
import { isRegexBroad, validateRegex } from '~/util/validate';
116+
import { findCommonPhrases } from '~/util/categorization';
116117
117118
export default {
118119
name: 'aw-category-builder',
@@ -224,26 +225,7 @@ export default {
224225
);
225226
226227
const events = data[0];
227-
const words = new Map<string, { word: string; duration: number; events: any[] }>();
228-
for (const event of events) {
229-
const words_in_event = event.data.title.split(/[\s\-,:()[\]/]/);
230-
for (const word of words_in_event) {
231-
if (word.length <= 2 || this.ignored_words.includes(word)) {
232-
continue;
233-
}
234-
if (words.has(word)) {
235-
words.get(word).duration += event.duration;
236-
words.get(word).events.push(event);
237-
} else {
238-
words.set(word, {
239-
word: word,
240-
duration: event.duration,
241-
events: [event],
242-
});
243-
}
244-
}
245-
}
246-
this.words = words;
228+
this.words = findCommonPhrases(events, this.ignored_words);
247229
this.loading = false;
248230
},
249231
showEvents(word) {
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
import { findCommonPhrases } from '~/util/categorization';
2+
import { IEvent } from '~/util/interfaces';
3+
4+
/**
 * Test fixture: builds an event carrying only what findCommonPhrases reads
 * (a title and a duration), stamped with the current time.
 */
function makeEvent(title: string, duration: number): IEvent {
  const timestamp = new Date().toISOString();
  const event: IEvent = { timestamp, duration, data: { title } };
  return event;
}
11+
12+
// Behavioural spec for findCommonPhrases: word accumulation, filtering of
// short / ignored words, separator handling and bigram promotion.
describe('findCommonPhrases', () => {
  test('returns empty map for empty events', () => {
    const phrases = findCommonPhrases([], []);
    expect(phrases).toEqual(new Map());
  });

  test('counts single words by duration', () => {
    // One-word titles cannot form bigrams, so each word just sums its events.
    const phrases = findCommonPhrases(
      [makeEvent('hello', 100), makeEvent('hello', 50), makeEvent('world', 80)],
      []
    );
    expect(phrases.get('hello')?.duration).toBeCloseTo(150);
    expect(phrases.get('world')?.duration).toBeCloseTo(80);
  });

  test('promotes bigram when it dominates both constituent words', () => {
    // Both words only ever occur next to each other.
    const phrases = findCommonPhrases(
      [makeEvent('Mozilla Firefox', 100), makeEvent('Mozilla Firefox', 100)],
      []
    );
    const promoted = phrases.get('Mozilla Firefox');
    expect(promoted).toBeDefined();
    expect(promoted?.duration).toBe(200);
    // All of the words' time moved into the bigram.
    for (const part of ['Mozilla', 'Firefox']) {
      expect(phrases.get(part)?.duration).toBe(0);
    }
  });

  test('does not promote bigram when one word appears independently too often', () => {
    // Totals: Mozilla=160, Firefox=60, "Mozilla Firefox"=60.
    // 60/160 = 0.375, below the 0.5 threshold, so no promotion happens.
    const firefoxEvent = makeEvent('Mozilla Firefox', 60);
    const browserEvent = makeEvent('Mozilla Browser', 100); // independent "Mozilla" time
    const phrases = findCommonPhrases([firefoxEvent, browserEvent], []);
    expect(phrases.get('Mozilla Firefox')).toBeUndefined();
    for (const part of ['Mozilla', 'Firefox']) {
      expect(phrases.get(part)).toBeDefined();
    }
  });

  test('filters out words with length <= 2', () => {
    const phrases = findCommonPhrases([makeEvent('is at go home', 100)], []);
    for (const tooShort of ['is', 'at', 'go']) {
      expect(phrases.get(tooShort)).toBeUndefined();
    }
    expect(phrases.get('home')).toBeDefined();
  });

  test('filters out ignored words', () => {
    const ignored = ['GitHub', 'Chrome'];
    const phrases = findCommonPhrases([makeEvent('GitHub Chrome Test', 100)], ignored);
    for (const skipped of ['GitHub', 'Chrome']) {
      expect(phrases.get(skipped)).toBeUndefined();
    }
    expect(phrases.get('Test')).toBeDefined();
  });

  test('ignored words are not used as bigram components', () => {
    const phrases = findCommonPhrases(
      [makeEvent('GitHub Desktop', 100), makeEvent('GitHub Desktop', 100)],
      ['GitHub']
    );
    // With "GitHub" ignored there is no "GitHub Desktop" candidate to promote.
    expect(phrases.get('GitHub Desktop')).toBeUndefined();
    expect(phrases.get('Desktop')).toBeDefined();
  });

  test('handles titles split by various separator characters', () => {
    const phrases = findCommonPhrases([makeEvent('foo-bar,baz:qux(quux)', 100)], []);
    for (const token of ['foo', 'bar', 'baz', 'qux', 'quux']) {
      expect(phrases.get(token)).toBeDefined();
    }
  });

  test('accumulated duration across multiple events', () => {
    const sessions: IEvent[] = [];
    for (let n = 0; n < 4; n++) {
      sessions.push(makeEvent('Python Programming', 30));
    }
    const phrases = findCommonPhrases(sessions, []);
    // Identical titles everywhere, so the bigram takes all the time.
    expect(phrases.get('Python Programming')?.duration).toBe(120);
    for (const part of ['Python', 'Programming']) {
      expect(phrases.get(part)?.duration).toBe(0);
    }
  });

  test('returns Map with word entries containing events list', () => {
    const fixture = [makeEvent('Hello World', 100)];
    const hello = findCommonPhrases(fixture, []).get('Hello');
    expect(hello).toBeDefined();
    expect(hello?.word).toBe('Hello');
    expect(hello?.events).toHaveLength(1);
    expect(hello?.events[0]).toBe(fixture[0]);
  });

  test('trigram: does not double-promote when middle word duration is consumed', () => {
    // Word totals:   Alpha=110, Beta=110, Gamma=10
    // Bigram totals: "Alpha Beta"=110, "Beta Gamma"=10
    // "Alpha Beta" promotes (110/110 for both words) and Beta drops to 0.
    // If the threshold were checked against that reduced value, "Beta Gamma"
    // would see 10/0 = Infinity and promote as well. Checked against the
    // original Beta=110 it is 10/110 ≈ 0.09, below 0.5, so it must not.
    const frequent = makeEvent('Alpha Beta', 100);
    const rare = makeEvent('Alpha Beta Gamma', 10);
    const phrases = findCommonPhrases([frequent, rare], []);
    expect(phrases.get('Alpha Beta')).toBeDefined(); // promoted
    expect(phrases.get('Beta Gamma')).toBeUndefined(); // not promoted
    const beta = phrases.get('Beta');
    expect(beta?.duration).toBeGreaterThanOrEqual(0); // never negative
  });
});

0 commit comments

Comments
 (0)