Skip to content

Commit c31f78e

Browse files
committed
🐛 fix(triggers): retry failed batch sends, audit delivery failures, fix counter + suppression (#282)
Pure batch mode treated a failed notification send as terminal: it dropped the containers, didn't increment trigger metrics, didn't audit the failure, and suppressed repeated errors under an unknown watcher signature. - batchRetryBuffer retains failed containers and revalidates against current store state before retrying on the next watcher cycle - recordBatchDeliveryFailure writes notification-delivery-failed audit entries so failures are visible regardless of log level - incrementTriggerCounter now called in finally block for both success and failure (parity with simple mode and digest flush) - Error suppression keys off first container's watcher instead of undefined, preventing cross-watcher suppression - Retry buffer only active in pure batch mode — batch+digest uses the digest buffer as fallback instead
1 parent a4e146c commit c31f78e

File tree

3 files changed

+298
-24
lines changed

3 files changed

+298
-24
lines changed

app/model/audit.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ export interface AuditEntry {
55
| 'update-available'
66
| 'update-applied'
77
| 'update-failed'
8+
| 'notification-delivery-failed'
89
| 'container-update'
910
| 'security-alert'
1011
| 'agent-disconnect'

app/triggers/providers/Trigger.test.ts

Lines changed: 180 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import mockCron from 'node-cron';
33
import * as configuration from '../../configuration/index.js';
44
import * as event from '../../event/index.js';
55
import log from '../../log/index.js';
6+
import * as auditStore from '../../store/audit.js';
67
import * as storeContainer from '../../store/container.js';
78
import * as notificationStore from '../../store/notification.js';
89
import Trigger, {
@@ -11,9 +12,14 @@ import Trigger, {
1112
resolveNotificationTemplate,
1213
} from './Trigger.js';
1314

15+
const mockTriggerCounterInc = vi.hoisted(() => vi.fn());
16+
1417
vi.mock('node-cron');
1518
vi.mock('../../log');
1619
vi.mock('../../event');
20+
vi.mock('../../store/audit.js', () => ({
21+
insertAudit: vi.fn(),
22+
}));
1723
vi.mock('../../store/notification.js', () => ({
1824
isTriggerEnabledForRule: vi.fn(() => true),
1925
getTriggerDispatchDecisionForRule: vi.fn(() => ({
@@ -27,7 +33,7 @@ vi.mock('../../store/container.js', () => ({
2733
}));
2834
vi.mock('../../prometheus/trigger', () => ({
2935
getTriggerCounter: () => ({
30-
inc: () => ({}),
36+
inc: mockTriggerCounterInc,
3137
}),
3238
}));
3339

@@ -51,6 +57,7 @@ const configurationValid = {
5157

5258
beforeEach(async () => {
5359
vi.resetAllMocks();
60+
mockTriggerCounterInc.mockReset();
5461
notificationStore.isTriggerEnabledForRule.mockReturnValue(true);
5562
notificationStore.getTriggerDispatchDecisionForRule.mockReturnValue({
5663
enabled: true,
@@ -3073,6 +3080,138 @@ test('handleContainerReports should warn when triggerBatch fails', async () => {
30733080
expect(spyLog).toHaveBeenCalledWith('Error (batch fail)');
30743081
});
30753082

3083+
test('handleContainerReports should retain failed batch deliveries for retry on later watcher cycles', async () => {
3084+
trigger.configuration = {
3085+
threshold: 'all',
3086+
once: true,
3087+
mode: 'batch',
3088+
};
3089+
trigger.triggerBatch = vi
3090+
.fn()
3091+
.mockRejectedValueOnce(new Error('batch fail'))
3092+
.mockResolvedValueOnce(undefined);
3093+
await trigger.init();
3094+
3095+
const container = {
3096+
name: 'c1',
3097+
watcher: 'local',
3098+
updateAvailable: true,
3099+
updateKind: { kind: 'tag', semverDiff: 'major' },
3100+
};
3101+
storeContainer.getContainersRaw.mockReturnValue([container]);
3102+
3103+
await trigger.handleContainerReports([
3104+
{
3105+
changed: true,
3106+
container,
3107+
},
3108+
]);
3109+
3110+
await trigger.handleContainerReports([
3111+
{
3112+
changed: false,
3113+
container,
3114+
},
3115+
]);
3116+
3117+
expect(trigger.triggerBatch).toHaveBeenCalledTimes(2);
3118+
expect(trigger.triggerBatch).toHaveBeenLastCalledWith([container]);
3119+
});
3120+
3121+
test('handleContainerReports should increment trigger counter when batch send succeeds', async () => {
3122+
trigger.type = 'smtp';
3123+
trigger.name = 'gmail';
3124+
trigger.configuration = {
3125+
threshold: 'all',
3126+
once: true,
3127+
mode: 'batch',
3128+
};
3129+
trigger.triggerBatch = vi.fn().mockResolvedValue(undefined);
3130+
3131+
await trigger.handleContainerReports([
3132+
{
3133+
changed: true,
3134+
container: {
3135+
name: 'c1',
3136+
watcher: 'local',
3137+
updateAvailable: true,
3138+
updateKind: { kind: 'tag', semverDiff: 'major' },
3139+
},
3140+
},
3141+
]);
3142+
3143+
expect(mockTriggerCounterInc).toHaveBeenCalledWith({
3144+
type: 'smtp',
3145+
name: 'gmail',
3146+
status: 'success',
3147+
});
3148+
});
3149+
3150+
test('handleContainerReports should increment trigger counter when batch send fails', async () => {
3151+
trigger.type = 'smtp';
3152+
trigger.name = 'gmail';
3153+
trigger.configuration = {
3154+
threshold: 'all',
3155+
once: true,
3156+
mode: 'batch',
3157+
};
3158+
trigger.triggerBatch = vi.fn().mockRejectedValue(new Error('batch fail'));
3159+
3160+
await trigger.handleContainerReports([
3161+
{
3162+
changed: true,
3163+
container: {
3164+
name: 'c1',
3165+
watcher: 'local',
3166+
updateAvailable: true,
3167+
updateKind: { kind: 'tag', semverDiff: 'major' },
3168+
},
3169+
},
3170+
]);
3171+
3172+
expect(mockTriggerCounterInc).toHaveBeenCalledWith({
3173+
type: 'smtp',
3174+
name: 'gmail',
3175+
status: 'error',
3176+
});
3177+
});
3178+
3179+
test('handleContainerReports should audit failed batch deliveries', async () => {
3180+
trigger.type = 'smtp';
3181+
trigger.name = 'gmail';
3182+
trigger.configuration = {
3183+
threshold: 'all',
3184+
once: true,
3185+
mode: 'batch',
3186+
};
3187+
trigger.triggerBatch = vi.fn().mockRejectedValue(new Error('SMTP timeout'));
3188+
3189+
await trigger.handleContainerReports([
3190+
{
3191+
changed: true,
3192+
container: {
3193+
name: 'c1',
3194+
watcher: 'local',
3195+
image: {
3196+
name: 'library/nginx',
3197+
},
3198+
updateAvailable: true,
3199+
updateKind: { kind: 'tag', localValue: '1.0.0', remoteValue: '2.0.0', semverDiff: 'major' },
3200+
},
3201+
},
3202+
]);
3203+
3204+
expect(auditStore.insertAudit).toHaveBeenCalledWith(
3205+
expect.objectContaining({
3206+
action: 'notification-delivery-failed',
3207+
containerName: 'local_c1',
3208+
triggerName: 'smtp.gmail',
3209+
status: 'error',
3210+
details: 'SMTP timeout',
3211+
}),
3212+
);
3213+
});
3214+
30763215
test('handleContainerReports should suppress repeated identical batch errors during a short burst', async () => {
30773216
trigger.configuration = {
30783217
threshold: 'all',
@@ -3105,6 +3244,46 @@ test('handleContainerReports should suppress repeated identical batch errors dur
31053244
expect(debugSpy).toHaveBeenCalledWith('Suppressed repeated error (batch fail)');
31063245
});
31073246

3247+
test('handleContainerReports should not suppress identical batch errors across different watchers', async () => {
3248+
trigger.configuration = {
3249+
threshold: 'all',
3250+
mode: 'batch',
3251+
};
3252+
trigger.triggerBatch = vi.fn().mockRejectedValue(new Error('batch fail'));
3253+
await trigger.init();
3254+
const warnSpy = vi.spyOn(log, 'warn');
3255+
const debugSpy = vi.spyOn(log, 'debug');
3256+
let now = 1_000;
3257+
vi.spyOn(Date, 'now').mockImplementation(() => now);
3258+
3259+
await trigger.handleContainerReports([
3260+
{
3261+
changed: true,
3262+
container: {
3263+
name: 'c1',
3264+
watcher: 'local',
3265+
updateAvailable: true,
3266+
updateKind: { kind: 'tag', semverDiff: 'major' },
3267+
},
3268+
},
3269+
]);
3270+
now = 1_500;
3271+
await trigger.handleContainerReports([
3272+
{
3273+
changed: true,
3274+
container: {
3275+
name: 'c1',
3276+
watcher: 'servicevault',
3277+
updateAvailable: true,
3278+
updateKind: { kind: 'tag', semverDiff: 'major' },
3279+
},
3280+
},
3281+
]);
3282+
3283+
expect(warnSpy).toHaveBeenCalledTimes(2);
3284+
expect(debugSpy).not.toHaveBeenCalledWith('Suppressed repeated error (batch fail)');
3285+
});
3286+
31083287
test('flushEventBatchDispatch should warn when auto event batch dispatch fails', async () => {
31093288
trigger.configuration = {
31103289
threshold: 'all',

0 commit comments

Comments
 (0)