Merge pull request #35 from AssemblyAI/E07417BDFEA3614F5967B1520F8B2F61

Sync from internal repo (2024/02/15)
AssemblyAI · Feb 15, 2024 · 5705bfe · 5705bfe
2 parents 24d94a8 + bfb0089
commit 5705bfe
Show file tree

Hide file tree

Showing 7 changed files with 167 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,19 @@
 # Changelog
 
+## [4.3.0] - 2024-02-15
+
+### Added
+
+- Add `RealtimeTranscriber.configureEndUtteranceSilenceThreshold` function
+- Add `RealtimeTranscriber.forceEndUtterance` function
+- Add `end_utterance_silence_threshold` property to `CreateRealtimeTranscriberParams` and `RealtimeTranscriberParams` types.
+
+## [4.2.3] - 2024-02-13
+
+### Added
+
+- Add `speech_model` field to `TranscriptParams` and add `SpeechModel` type.
+
 ## [4.2.2] - 2024-01-29
 
 ### Changed

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "assemblyai",
-  "version": "4.2.3",
+  "version": "4.3.0",
   "description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
   "engines": {
     "node": ">=18"

diff --git a/scripts/kitchensink.ts b/scripts/kitchensink.ts
@@ -26,6 +26,7 @@ const client = new AssemblyAI({
         })
       : undefined,
     encoding: "pcm_s16le",
+    end_utterance_silence_threshold: 500,
   };
   const rt = client.realtime.transcriber(serviceParams);
 

diff --git a/src/services/realtime/service.ts b/src/services/realtime/service.ts
@@ -20,6 +20,26 @@ import {
 } from "../../utils/errors";
 
 const defaultRealtimeUrl = "wss://api.assemblyai.com/v2/realtime/ws";
+const forceEndOfUtteranceMessage = `{"force_end_utterance":true}`;
+const terminateSessionMessage = `{"terminate_session":true}`;
+
+/**
+ * Payload types accepted by the private `RealtimeTranscriber.send` helper,
+ * which hands the value to `WebSocket.send` without transforming it.
+ * NOTE(review): this union looks like a local copy of the `BufferLike` type
+ * from the `ws` typings — confirm, and keep the two in sync if so.
+ */
+type BufferLike =
+  | string
+  | Buffer
+  | DataView
+  | number
+  | ArrayBufferView
+  | Uint8Array
+  | ArrayBuffer
+  | SharedArrayBuffer
+  | ReadonlyArray<unknown>
+  | ReadonlyArray<number>
+  | { valueOf(): ArrayBuffer }
+  | { valueOf(): SharedArrayBuffer }
+  | { valueOf(): Uint8Array }
+  | { valueOf(): ReadonlyArray<number> }
+  | { valueOf(): string }
+  | { [Symbol.toPrimitive](hint: string): string };
 
 export class RealtimeTranscriber {
   private realtimeUrl: string;
@@ -28,6 +48,7 @@ export class RealtimeTranscriber {
   private encoding?: AudioEncoding;
   private apiKey?: string;
   private token?: string;
+  private end_utterance_silence_threshold?: number;
   private socket?: WebSocket;
   private listeners: RealtimeListeners = {};
   private sessionTerminatedResolve?: () => void;
@@ -37,6 +58,8 @@ export class RealtimeTranscriber {
     this.sampleRate = params.sampleRate ?? 16_000;
     this.wordBoost = params.wordBoost;
     this.encoding = params.encoding;
+    this.end_utterance_silence_threshold =
+      params.end_utterance_silence_threshold;
     if ("token" in params && params.token) this.token = params.token;
     if ("apiKey" in params && params.apiKey) this.apiKey = params.apiKey;
 
@@ -105,6 +128,18 @@ export class RealtimeTranscriber {
       }
       this.socket.binaryType = "arraybuffer";
 
+      // Once the socket opens, send the end-utterance silence threshold taken
+      // from the transcriber params. When no threshold was supplied, nothing is
+      // sent. The field is typed `number | undefined`; the extra `null` check is
+      // defensive (presumably for untyped callers — confirm).
+      this.socket.onopen = () => {
+        if (
+          this.end_utterance_silence_threshold === undefined ||
+          this.end_utterance_silence_threshold === null
+        ) {
+          return;
+        }
+        this.configureEndUtteranceSilenceThreshold(
+          this.end_utterance_silence_threshold
+        );
+      };
+
       this.socket.onclose = ({ code, reason }: CloseEvent) => {
         if (!reason) {
           if (code in RealtimeErrorType) {
@@ -159,10 +194,7 @@ export class RealtimeTranscriber {
   }
 
+  /**
+   * Send audio data over the WebSocket.
+   * @param audio The audio data to send
+   * @throws Error if the socket is not open for communication
+   */
   sendAudio(audio: AudioData) {
-    if (!this.socket || this.socket.readyState !== WebSocket.OPEN) {
-      throw new Error("Socket is not open for communication");
-    }
-    this.socket.send(audio);
+    this.send(audio);
   }
 
   stream(): WritableStream<AudioData> {
@@ -173,10 +205,32 @@ export class RealtimeTranscriber {
     });
   }
 
+  /**
+   * Manually end an utterance by sending the `{"force_end_utterance":true}`
+   * message over the WebSocket.
+   * @throws Error if the socket is not open for communication
+   */
+  forceEndUtterance() {
+    this.send(forceEndOfUtteranceMessage);
+  }
+
+  /**
+   * Configure the threshold for how long to wait before ending an utterance. Default is 700ms.
+   * The value is interpolated as-is into the JSON text sent over the WebSocket, so it
+   * must be a finite number; `NaN` or `Infinity` would produce an invalid JSON message.
+   * @param threshold The duration of the end utterance silence threshold in milliseconds
+   * @format integer
+   * @throws Error if the socket is not open for communication
+   */
+  configureEndUtteranceSilenceThreshold(threshold: number) {
+    this.send(`{"end_utterance_silence_threshold":${threshold}}`);
+  }
+
+  /**
+   * Send data over the WebSocket. Shared by `sendAudio`, `forceEndUtterance`
+   * and `configureEndUtteranceSilenceThreshold` so the open-socket check lives
+   * in one place.
+   * @param data The payload handed to `WebSocket.send`
+   * @throws Error if the socket has not been created or is not in the OPEN state
+   */
+  private send(data: BufferLike) {
+    if (!this.socket || this.socket.readyState !== WebSocket.OPEN) {
+      throw new Error("Socket is not open for communication");
+    }
+    this.socket.send(data);
+  }
+
   async close(waitForSessionTermination = true) {
     if (this.socket) {
       if (this.socket.readyState === WebSocket.OPEN) {
-        const terminateSessionMessage = `{"terminate_session": true}`;
         if (waitForSessionTermination) {
           const sessionTerminatedPromise = new Promise<void>((resolve) => {
             this.sessionTerminatedResolve = resolve;

diff --git a/src/types/asyncapi.generated.ts b/src/types/asyncapi.generated.ts
@@ -28,6 +28,12 @@ export type AudioData = ArrayBufferLike;
  */
 export type AudioEncoding = "pcm_s16le" | "pcm_mulaw";
 
+/** @description Configure the threshold for how long to wait before ending an utterance. Default is 700ms. */
+export type ConfigureEndUtteranceSilenceThreshold = {
+  /** @description The duration threshold in milliseconds */
+  end_utterance_silence_threshold: number;
+};
+
 export type FinalTranscript = RealtimeBaseTranscript & {
   /**
    * @description Describes the type of message
@@ -40,6 +46,12 @@ export type FinalTranscript = RealtimeBaseTranscript & {
   text_formatted: boolean;
 };
 
+/** @description Manually end an utterance */
+export type ForceEndUtterance = {
+  /** @description A boolean value to communicate that you wish to force the end of the utterance */
+  force_end_utterance: boolean;
+};
+
 /** @enum {string} */
 export type MessageType =
   | "SessionBegins"

diff --git a/src/types/realtime/index.ts b/src/types/realtime/index.ts
@@ -7,15 +7,38 @@ import {
 } from "../asyncapi.generated";
 
 type CreateRealtimeTranscriberParams = {
+  /**
+   * The WebSocket URL that the RealtimeTranscriber connects to
+   */
   realtimeUrl?: string;
+  /**
+   * The sample rate of the streamed audio
+   */
   sampleRate?: number;
+  /**
+   * Add up to 2500 characters of custom vocabulary
+   */
   wordBoost?: string[];
+  /**
+   * The encoding of the audio data
+   */
   encoding?: AudioEncoding;
+  /**
+   * The duration of the end utterance silence threshold in milliseconds
+   */
+  end_utterance_silence_threshold?: number;
 } & (
   | {
+      /**
+       * The API key used to authenticate the RealtimeTranscriber.
+       * Using an API key to authenticate the RealtimeTranscriber is not supported in the browser.
+       */
       apiKey?: string;
     }
   | {
+      /**
+       * The temporary token used to authenticate the RealtimeTranscriber
+       */
       token: string;
     }
 );
@@ -26,15 +49,38 @@ type CreateRealtimeTranscriberParams = {
 type CreateRealtimeServiceParams = CreateRealtimeTranscriberParams;
 
 type RealtimeTranscriberParams = {
+  /**
+   * The WebSocket URL that the RealtimeTranscriber connects to
+   */
   realtimeUrl?: string;
+  /**
+   * The sample rate of the streamed audio
+   */
   sampleRate?: number;
+  /**
+   * Add up to 2500 characters of custom vocabulary
+   */
   wordBoost?: string[];
+  /**
+   * The encoding of the audio data
+   */
   encoding?: AudioEncoding;
+  /**
+   * The duration of the end utterance silence threshold in milliseconds
+   */
+  end_utterance_silence_threshold?: number;
 } & (
   | {
+      /**
+       * The API key used to authenticate the RealtimeTranscriber.
+       * Using an API key to authenticate the RealtimeTranscriber is not supported in the browser.
+       */
       apiKey: string;
     }
   | {
+      /**
+       * The temporary token used to authenticate the RealtimeTranscriber
+       */
       token: string;
     }
 );

diff --git a/tests/realtime.test.ts b/tests/realtime.test.ts
@@ -57,6 +57,12 @@ describe("realtime", () => {
     WS.clean();
   }
 
+  it("fails without API key and token", async () => {
+    expect(() => new RealtimeTranscriber({ apiKey: "" })).toThrowError(
+      "API key or temporary token is required."
+    );
+  });
+
   it("fails on redundant connection", async () => {
     await expect(async () => await rt.connect()).rejects.toThrowError(
       "Already connected"
@@ -149,6 +155,34 @@ describe("realtime", () => {
     await expect(server).toReceiveMessage(data);
   });
 
+  it("creates service with EndUtteranceSilenceThreshold", async () => {
+    const realtimeUrl = "wss://localhost:5678";
+    const server = new WS(realtimeUrl);
+    const aai = createClient();
+    const rt = aai.realtime.transcriber({
+      realtimeUrl,
+      apiKey: "123",
+      end_utterance_silence_threshold: 500,
+    });
+    await connect(rt, server);
+    await expect(server).toReceiveMessage(
+      `{"end_utterance_silence_threshold":500}`
+    );
+    await close(rt, server);
+  });
+
+  it("can set EndUtteranceSilenceThreshold", async () => {
+    rt.configureEndUtteranceSilenceThreshold(500);
+    await expect(server).toReceiveMessage(
+      `{"end_utterance_silence_threshold":500}`
+    );
+  });
+
+  it("can set forceEndUtterance", async () => {
+    rt.forceEndUtterance();
+    await expect(server).toReceiveMessage(`{"force_end_utterance":true}`);
+  });
+
   it("can receive transcript", () => {
     const data = {
       created: "2023-09-14T03:37:11.516967",