diff --git a/CHANGELOG.md b/CHANGELOG.md
index 156403c..5a7f171 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,19 @@
 # Changelog
 
+## [4.3.0] - 2024-02-15
+
+### Added
+
+- Add `RealtimeTranscriber.configureEndUtteranceSilenceThreshold` function
+- Add `RealtimeTranscriber.forceEndUtterance` function
+- Add `end_utterance_silence_threshold` property to `CreateRealtimeTranscriberParams` and `RealtimeTranscriberParams` types.
+
+## [4.2.3] - 2024-02-13
+
+### Added
+
+- Add `speech_model` field to `TranscriptParams` and add `SpeechModel` type.
+
 ## [4.2.2] - 2024-01-29
 
 ### Changed
diff --git a/package.json b/package.json
index a4e1758..a16b054 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "assemblyai",
-  "version": "4.2.3",
+  "version": "4.3.0",
   "description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
   "engines": {
     "node": ">=18"
diff --git a/scripts/kitchensink.ts b/scripts/kitchensink.ts
index 3e128e3..5e751dc 100644
--- a/scripts/kitchensink.ts
+++ b/scripts/kitchensink.ts
@@ -26,6 +26,7 @@ const client = new AssemblyAI({
       })
     : undefined,
   encoding: "pcm_s16le",
+  end_utterance_silence_threshold: 500,
 };
 
 const rt = client.realtime.transcriber(serviceParams);
diff --git a/src/services/realtime/service.ts b/src/services/realtime/service.ts
index bc7ca09..871e572 100644
--- a/src/services/realtime/service.ts
+++ b/src/services/realtime/service.ts
@@ -20,6 +20,26 @@ import {
 } from "../../utils/errors";
 
 const defaultRealtimeUrl = "wss://api.assemblyai.com/v2/realtime/ws";
+const forceEndOfUtteranceMessage = `{"force_end_utterance":true}`;
+const terminateSessionMessage = `{"terminate_session":true}`;
+
+type BufferLike =
+  | string
+  | Buffer
+  | DataView
+  | number
+  | ArrayBufferView
+  | Uint8Array
+  | ArrayBuffer
+  | SharedArrayBuffer
+  | ReadonlyArray<any>
+  | ReadonlyArray<number>
+  | { valueOf(): ArrayBuffer }
+  | { valueOf(): SharedArrayBuffer }
+  | { valueOf(): Uint8Array }
+  | { valueOf(): ReadonlyArray<number> }
+  | { valueOf(): string }
+  | { [Symbol.toPrimitive](hint: string): string };
 
 export class RealtimeTranscriber {
   private realtimeUrl: string;
@@ -28,6 +48,7 @@ export class RealtimeTranscriber {
   private encoding?: AudioEncoding;
   private apiKey?: string;
   private token?: string;
+  private end_utterance_silence_threshold?: number;
   private socket?: WebSocket;
   private listeners: RealtimeListeners = {};
   private sessionTerminatedResolve?: () => void;
@@ -37,6 +58,8 @@
     this.sampleRate = params.sampleRate ?? 16_000;
     this.wordBoost = params.wordBoost;
     this.encoding = params.encoding;
+    this.end_utterance_silence_threshold =
+      params.end_utterance_silence_threshold;
 
     if ("token" in params && params.token) this.token = params.token;
     if ("apiKey" in params && params.apiKey) this.apiKey = params.apiKey;
@@ -105,6 +128,18 @@ export class RealtimeTranscriber {
     }
     this.socket.binaryType = "arraybuffer";
 
+    this.socket.onopen = () => {
+      if (
+        this.end_utterance_silence_threshold === undefined ||
+        this.end_utterance_silence_threshold === null
+      ) {
+        return;
+      }
+      this.configureEndUtteranceSilenceThreshold(
+        this.end_utterance_silence_threshold
+      );
+    };
+
     this.socket.onclose = ({ code, reason }: CloseEvent) => {
       if (!reason) {
         if (code in RealtimeErrorType) {
@@ -159,10 +194,7 @@ export class RealtimeTranscriber {
   }
 
   sendAudio(audio: AudioData) {
-    if (!this.socket || this.socket.readyState !== WebSocket.OPEN) {
-      throw new Error("Socket is not open for communication");
-    }
-    this.socket.send(audio);
+    this.send(audio);
   }
 
   stream(): WritableStream<AudioData> {
@@ -173,10 +205,32 @@ export class RealtimeTranscriber {
     });
   }
 
+  /**
+   * Manually end an utterance
+   */
+  forceEndUtterance() {
+    this.send(forceEndOfUtteranceMessage);
+  }
+
+  /**
+   * Configure the threshold for how long to wait before ending an utterance. Default is 700ms.
+   * @param threshold The duration of the end utterance silence threshold in milliseconds
+   * @format integer
+   */
+  configureEndUtteranceSilenceThreshold(threshold: number) {
+    this.send(`{"end_utterance_silence_threshold":${threshold}}`);
+  }
+
+  private send(data: BufferLike) {
+    if (!this.socket || this.socket.readyState !== WebSocket.OPEN) {
+      throw new Error("Socket is not open for communication");
+    }
+    this.socket.send(data);
+  }
+
   async close(waitForSessionTermination = true) {
     if (this.socket) {
       if (this.socket.readyState === WebSocket.OPEN) {
-        const terminateSessionMessage = `{"terminate_session": true}`;
         if (waitForSessionTermination) {
           const sessionTerminatedPromise = new Promise<void>((resolve) => {
             this.sessionTerminatedResolve = resolve;
diff --git a/src/types/asyncapi.generated.ts b/src/types/asyncapi.generated.ts
index fba7dd1..2586d9c 100644
--- a/src/types/asyncapi.generated.ts
+++ b/src/types/asyncapi.generated.ts
@@ -28,6 +28,12 @@ export type AudioData = ArrayBufferLike;
  */
 export type AudioEncoding = "pcm_s16le" | "pcm_mulaw";
 
+/** @description Configure the threshold for how long to wait before ending an utterance. Default is 700ms. */
+export type ConfigureEndUtteranceSilenceThreshold = {
+  /** @description The duration threshold in milliseconds */
+  end_utterance_silence_threshold: number;
+};
+
 export type FinalTranscript = RealtimeBaseTranscript & {
   /**
    * @description Describes the type of message
@@ -40,6 +46,12 @@ export type FinalTranscript = RealtimeBaseTranscript & {
   text_formatted: boolean;
 };
 
+/** @description Manually end an utterance */
+export type ForceEndUtterance = {
+  /** @description A boolean value to communicate that you wish to force the end of the utterance */
+  force_end_utterance: boolean;
+};
+
 /** @enum {string} */
 export type MessageType =
   | "SessionBegins"
diff --git a/src/types/realtime/index.ts b/src/types/realtime/index.ts
index 54e76f6..025c8d9 100644
--- a/src/types/realtime/index.ts
+++ b/src/types/realtime/index.ts
@@ -7,15 +7,38 @@ import {
 } from "../asyncapi.generated";
 
 type CreateRealtimeTranscriberParams = {
+  /**
+   * The WebSocket URL that the RealtimeTranscriber connects to
+   */
   realtimeUrl?: string;
+  /**
+   * The sample rate of the streamed audio
+   */
   sampleRate?: number;
+  /**
+   * Add up to 2500 characters of custom vocabulary
+   */
   wordBoost?: string[];
+  /**
+   * The encoding of the audio data
+   */
   encoding?: AudioEncoding;
+  /**
+   * The duration of the end utterance silence threshold in milliseconds
+   */
+  end_utterance_silence_threshold?: number;
 } & (
   | {
+      /**
+       * The API key used to authenticate the RealtimeTranscriber
+       * Using an API key to authenticate the RealtimeTranscriber is not supported in the browser.
+       */
       apiKey?: string;
     }
   | {
+      /**
+       * The temporary token used to authenticate the RealtimeTranscriber
+       */
       token: string;
     }
 );
@@ -26,15 +49,38 @@ type CreateRealtimeTranscriberParams = {
 type CreateRealtimeServiceParams = CreateRealtimeTranscriberParams;
 
 type RealtimeTranscriberParams = {
+  /**
+   * The WebSocket URL that the RealtimeTranscriber connects to
+   */
   realtimeUrl?: string;
+  /**
+   * The sample rate of the streamed audio
+   */
   sampleRate?: number;
+  /**
+   * Add up to 2500 characters of custom vocabulary
+   */
   wordBoost?: string[];
+  /**
+   * The encoding of the audio data
+   */
   encoding?: AudioEncoding;
+  /**
+   * The duration of the end utterance silence threshold in milliseconds
+   */
+  end_utterance_silence_threshold?: number;
 } & (
   | {
+      /**
+       * The API key used to authenticate the RealtimeTranscriber.
+       * Using an API key to authenticate the RealtimeTranscriber is not supported in the browser.
+       */
      apiKey: string;
     }
   | {
+      /**
+       * The temporary token used to authenticate the RealtimeTranscriber
+       */
       token: string;
     }
 );
diff --git a/tests/realtime.test.ts b/tests/realtime.test.ts
index d44cfa0..2840224 100644
--- a/tests/realtime.test.ts
+++ b/tests/realtime.test.ts
@@ -57,6 +57,12 @@ describe("realtime", () => {
     WS.clean();
   }
 
+  it("fails without API key and token", async () => {
+    expect(() => new RealtimeTranscriber({ apiKey: "" })).toThrowError(
+      "API key or temporary token is required."
+    );
+  });
+
   it("fails on redundant connection", async () => {
     await expect(async () => await rt.connect()).rejects.toThrowError(
       "Already connected"
@@ -149,6 +155,34 @@ describe("realtime", () => {
     await expect(server).toReceiveMessage(data);
   });
 
+  it("creates service with EndUtteranceSilenceThreshold", async () => {
+    const realtimeUrl = "wss://localhost:5678";
+    const server = new WS(realtimeUrl);
+    const aai = createClient();
+    const rt = aai.realtime.transcriber({
+      realtimeUrl,
+      apiKey: "123",
+      end_utterance_silence_threshold: 500,
+    });
+    await connect(rt, server);
+    await expect(server).toReceiveMessage(
+      `{"end_utterance_silence_threshold":500}`
+    );
+    await close(rt, server);
+  });
+
+  it("can set EndUtteranceSilenceThreshold", async () => {
+    rt.configureEndUtteranceSilenceThreshold(500);
+    await expect(server).toReceiveMessage(
+      `{"end_utterance_silence_threshold":500}`
+    );
+  });
+
+  it("can set forceEndUtterance", async () => {
+    rt.forceEndUtterance();
+    await expect(server).toReceiveMessage(`{"force_end_utterance":true}`);
+  });
+
   it("can receive transcript", () => {
     const data = {
       created: "2023-09-14T03:37:11.516967",