Skip to content

Commit

Permalink
Merge pull request #35 from AssemblyAI/E07417BDFEA3614F5967B1520F8B2F61
Browse files Browse the repository at this point in the history
Sync from internal repo (2024/02/15)
  • Loading branch information
Swimburger committed Feb 15, 2024
2 parents 24d94a8 + bfb0089 commit 5705bfe
Show file tree
Hide file tree
Showing 7 changed files with 167 additions and 6 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Changelog

## [4.3.0] - 2024-02-15

### Added

- Add `RealtimeTranscriber.configureEndUtteranceSilenceThreshold` function.
- Add `RealtimeTranscriber.forceEndUtterance` function.
- Add `end_utterance_silence_threshold` property to `CreateRealtimeTranscriberParams` and `RealtimeTranscriberParams` types.

## [4.2.3] - 2024-02-13

### Added

- Add `speech_model` field to `TranscriptParams` and add `SpeechModel` type.

## [4.2.2] - 2024-01-29

### Changed
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "assemblyai",
"version": "4.2.3",
"version": "4.3.0",
"description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
"engines": {
"node": ">=18"
Expand Down
1 change: 1 addition & 0 deletions scripts/kitchensink.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ const client = new AssemblyAI({
})
: undefined,
encoding: "pcm_s16le",
end_utterance_silence_threshold: 500,
};
const rt = client.realtime.transcriber(serviceParams);

Expand Down
64 changes: 59 additions & 5 deletions src/services/realtime/service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,26 @@ import {
} from "../../utils/errors";

const defaultRealtimeUrl = "wss://api.assemblyai.com/v2/realtime/ws";
const forceEndOfUtteranceMessage = `{"force_end_utterance":true}`;
const terminateSessionMessage = `{"terminate_session":true}`;

/**
 * Union of every payload shape that the private `send` helper accepts and
 * forwards to the underlying socket's `send` method.
 */
type BufferLike =
  // Primitive payloads
  | string
  | number
  // Raw buffers and views over them
  | ArrayBuffer
  | SharedArrayBuffer
  | ArrayBufferView
  | DataView
  | Uint8Array
  | Buffer
  // Array payloads
  | ReadonlyArray<number>
  | ReadonlyArray<unknown>
  // Objects exposing `valueOf` / `Symbol.toPrimitive` conversions
  | { valueOf(): string }
  | { valueOf(): ArrayBuffer }
  | { valueOf(): SharedArrayBuffer }
  | { valueOf(): Uint8Array }
  | { valueOf(): ReadonlyArray<number> }
  | { [Symbol.toPrimitive](hint: string): string };

export class RealtimeTranscriber {
private realtimeUrl: string;
Expand All @@ -28,6 +48,7 @@ export class RealtimeTranscriber {
private encoding?: AudioEncoding;
private apiKey?: string;
private token?: string;
private end_utterance_silence_threshold?: number;
private socket?: WebSocket;
private listeners: RealtimeListeners = {};
private sessionTerminatedResolve?: () => void;
Expand All @@ -37,6 +58,8 @@ export class RealtimeTranscriber {
this.sampleRate = params.sampleRate ?? 16_000;
this.wordBoost = params.wordBoost;
this.encoding = params.encoding;
this.end_utterance_silence_threshold =
params.end_utterance_silence_threshold;
if ("token" in params && params.token) this.token = params.token;
if ("apiKey" in params && params.apiKey) this.apiKey = params.apiKey;

Expand Down Expand Up @@ -105,6 +128,18 @@ export class RealtimeTranscriber {
}
this.socket.binaryType = "arraybuffer";

// As soon as the socket opens, forward the configured end-utterance silence
// threshold; when none was configured, nothing is sent.
this.socket.onopen = () => {
  const threshold = this.end_utterance_silence_threshold;
  if (threshold !== undefined && threshold !== null) {
    this.configureEndUtteranceSilenceThreshold(threshold);
  }
};

this.socket.onclose = ({ code, reason }: CloseEvent) => {
if (!reason) {
if (code in RealtimeErrorType) {
Expand Down Expand Up @@ -159,10 +194,7 @@ export class RealtimeTranscriber {
}

/**
 * Send audio data over the WebSocket.
 *
 * Delegates to the private `send` helper so the socket-state check lives in
 * exactly one place and each chunk is transmitted exactly once.
 * @param audio The audio data to transmit
 * @throws Error if the socket is missing or not open for communication
 */
sendAudio(audio: AudioData) {
  this.send(audio);
}

stream(): WritableStream<AudioData> {
Expand All @@ -173,10 +205,32 @@ export class RealtimeTranscriber {
});
}

/**
* Manually end an utterance.
*
* Sends the `{"force_end_utterance":true}` message over the WebSocket.
* @throws Error if the socket is missing or not open for communication
*/
forceEndUtterance() {
this.send(forceEndOfUtteranceMessage);
}

/**
 * Set how long to wait in silence before an utterance is ended. Default is 700ms.
 * @param threshold The silence duration in milliseconds, expected to be an integer
 * @throws Error if the socket is missing or not open for communication
 */
configureEndUtteranceSilenceThreshold(threshold: number) {
  const message = `{"end_utterance_silence_threshold":${threshold}}`;
  this.send(message);
}

/**
 * Write a payload to the WebSocket after verifying it can accept data.
 * @param data The message or binary payload to transmit
 * @throws Error if the socket is missing or not in the OPEN state
 */
private send(data: BufferLike) {
  const socket = this.socket;
  if (!socket || socket.readyState !== WebSocket.OPEN) {
    throw new Error("Socket is not open for communication");
  }
  socket.send(data);
}

async close(waitForSessionTermination = true) {
if (this.socket) {
if (this.socket.readyState === WebSocket.OPEN) {
const terminateSessionMessage = `{"terminate_session": true}`;
if (waitForSessionTermination) {
const sessionTerminatedPromise = new Promise<void>((resolve) => {
this.sessionTerminatedResolve = resolve;
Expand Down
12 changes: 12 additions & 0 deletions src/types/asyncapi.generated.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ export type AudioData = ArrayBufferLike;
*/
export type AudioEncoding = "pcm_s16le" | "pcm_mulaw";

/**
* @description Configure the threshold for how long to wait before ending an utterance. Default is 700ms.
* Message shape: `{"end_utterance_silence_threshold": <milliseconds>}`.
*/
export type ConfigureEndUtteranceSilenceThreshold = {
/** @description The duration threshold in milliseconds */
end_utterance_silence_threshold: number;
};

export type FinalTranscript = RealtimeBaseTranscript & {
/**
* @description Describes the type of message
Expand All @@ -40,6 +46,12 @@ export type FinalTranscript = RealtimeBaseTranscript & {
text_formatted: boolean;
};

/**
* @description Manually end an utterance.
* Message shape: `{"force_end_utterance": <boolean>}`.
*/
export type ForceEndUtterance = {
/** @description A boolean value to communicate that you wish to force the end of the utterance */
force_end_utterance: boolean;
};

/** @enum {string} */
export type MessageType =
| "SessionBegins"
Expand Down
46 changes: 46 additions & 0 deletions src/types/realtime/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,38 @@ import {
} from "../asyncapi.generated";

/**
* Parameters for creating a RealtimeTranscriber.
* Authentication is expressed as a union: either an (optional) `apiKey` or a `token`.
*/
type CreateRealtimeTranscriberParams = {
/**
* The WebSocket URL that the RealtimeTranscriber connects to
*/
realtimeUrl?: string;
/**
* The sample rate of the streamed audio
*/
sampleRate?: number;
/**
* Add up to 2500 characters of custom vocabulary
*/
wordBoost?: string[];
/**
* The encoding of the audio data
*/
encoding?: AudioEncoding;
/**
* The duration of the end utterance silence threshold in milliseconds
*/
end_utterance_silence_threshold?: number;
} & (
| {
/**
* The API key used to authenticate the RealtimeTranscriber.
* Using an API key to authenticate the RealtimeTranscriber is not supported in the browser.
*/
apiKey?: string;
}
| {
/**
* The temporary token used to authenticate the RealtimeTranscriber
*/
token: string;
}
);
Expand All @@ -26,15 +49,38 @@ type CreateRealtimeTranscriberParams = {
type CreateRealtimeServiceParams = CreateRealtimeTranscriberParams;

/**
* Parameters for a RealtimeTranscriber.
* Unlike `CreateRealtimeTranscriberParams`, the `apiKey` variant of the
* authentication union is required here; the alternative is a `token`.
*/
type RealtimeTranscriberParams = {
/**
* The WebSocket URL that the RealtimeTranscriber connects to
*/
realtimeUrl?: string;
/**
* The sample rate of the streamed audio
*/
sampleRate?: number;
/**
* Add up to 2500 characters of custom vocabulary
*/
wordBoost?: string[];
/**
* The encoding of the audio data
*/
encoding?: AudioEncoding;
/**
* The duration of the end utterance silence threshold in milliseconds
*/
end_utterance_silence_threshold?: number;
} & (
| {
/**
* The API key used to authenticate the RealtimeTranscriber.
* Using an API key to authenticate the RealtimeTranscriber is not supported in the browser.
*/
apiKey: string;
}
| {
/**
* The temporary token used to authenticate the RealtimeTranscriber
*/
token: string;
}
);
Expand Down
34 changes: 34 additions & 0 deletions tests/realtime.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,12 @@ describe("realtime", () => {
WS.clean();
}

// Constructing a transcriber with an empty API key and no token must throw.
it("fails without API key and token", async () => {
expect(() => new RealtimeTranscriber({ apiKey: "" })).toThrowError(
"API key or temporary token is required."
);
});

it("fails on redundant connection", async () => {
await expect(async () => await rt.connect()).rejects.toThrowError(
"Already connected"
Expand Down Expand Up @@ -149,6 +155,34 @@ describe("realtime", () => {
await expect(server).toReceiveMessage(data);
});

// A threshold supplied at creation time should be sent to the server after connecting.
it("creates service with EndUtteranceSilenceThreshold", async () => {
const realtimeUrl = "wss://localhost:5678";
// Dedicated server and transcriber for this test; these locals shadow the
// `server` and `rt` that the other tests in this suite use.
const server = new WS(realtimeUrl);
const aai = createClient();
const rt = aai.realtime.transcriber({
realtimeUrl,
apiKey: "123",
end_utterance_silence_threshold: 500,
});
await connect(rt, server);
// The configuration message is expected without any explicit call from the test.
await expect(server).toReceiveMessage(
`{"end_utterance_silence_threshold":500}`
);
await close(rt, server);
});

// Calling the configure method explicitly should send the threshold message to the server.
it("can set EndUtteranceSilenceThreshold", async () => {
rt.configureEndUtteranceSilenceThreshold(500);
await expect(server).toReceiveMessage(
`{"end_utterance_silence_threshold":500}`
);
});

// forceEndUtterance should send the force-end message to the server.
it("can set forceEndUtterance", async () => {
rt.forceEndUtterance();
await expect(server).toReceiveMessage(`{"force_end_utterance":true}`);
});

it("can receive transcript", () => {
const data = {
created: "2023-09-14T03:37:11.516967",
Expand Down

0 comments on commit 5705bfe

Please sign in to comment.