diff --git a/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming.mdx b/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming.mdx index 0ce4f8fe..cf65015c 100644 --- a/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming.mdx +++ b/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming.mdx @@ -5,529 +5,1095 @@ To transcribe multichannel streaming audio, we recommend creating a separate ses The following code example demonstrates how to transcribe a dual-channel audio file with diarized, speaker-separated transcripts. This same approach can be applied to any multi-channel audio stream, including those with more than two channels. - - + - Firstly, install the required dependencies. - - ```bash - pip install websocket-client numpy pyaudio - ``` + Install the required dependencies. +```bash +pip install assemblyai numpy pyaudio +``` Use this complete script to transcribe dual-channel audio with speaker separation: - - ```python - import websocket - import json - import threading - import numpy as np - import wave - import time - import pyaudio - from urllib.parse import urlencode - - # Configuration - YOUR_API_KEY = "" - AUDIO_FILE_PATH = "" - API_BASE_URL = "wss://streaming.assemblyai.com/v3/ws" - API_PARAMS = { - "sample_rate": 8000, - "format_turns": "true", - } - - # Build API endpoint with URL encoding - API_ENDPOINT = f"{API_BASE_URL}?{urlencode(API_PARAMS)}" - - class ChannelTranscriber: - def __init__(self, channel_id, channel_name): - self.channel_id = channel_id - self.channel_name = channel_name - self.ws_app = None - self.audio_data = [] - self.current_turn_line = None - self.line_count = 0 - - def load_audio_channel(self): - """Extract single channel from dual-channel audio file.""" - with wave.open(AUDIO_FILE_PATH, 'rb') as wf: - frames = wf.readframes(wf.getnframes()) - audio_array = np.frombuffer(frames, dtype=np.int16) - - if wf.getnchannels() == 2: - audio_array = audio_array.reshape(-1, 2) - channel_audio = audio_array[:, self.channel_id] - - # Split into chunks for streaming - FRAMES_PER_BUFFER = 400 # 50ms chunks - for i in range(0, len(channel_audio), FRAMES_PER_BUFFER): - chunk = channel_audio[i:i+FRAMES_PER_BUFFER] - if len(chunk) < FRAMES_PER_BUFFER: - chunk = np.pad(chunk, (0, FRAMES_PER_BUFFER - len(chunk)), 'constant') - self.audio_data.append(chunk.astype(np.int16).tobytes()) - - def on_open(self, ws): - """Stream audio data when connection opens.""" - def stream_audio(): - for chunk in self.audio_data: - ws.send(chunk, websocket.ABNF.OPCODE_BINARY) - time.sleep(0.05) # 50ms intervals - - # Send termination message - terminate_message = {"type": "Terminate"} - ws.send(json.dumps(terminate_message)) - - threading.Thread(target=stream_audio, daemon=True).start() - - def clear_current_line(self): - if self.current_turn_line is not None: - print("\r" + " " * 100 + "\r", end="", flush=True) - - def print_partial_transcript(self, words): - self.clear_current_line() - # Build transcript from individual words - word_texts = [word.get('text', '') for word in words] - transcript = ' '.join(word_texts) - partial_text = f"{self.channel_name}: {transcript}" - print(partial_text, end="", flush=True) - self.current_turn_line = len(partial_text) - - def print_final_transcript(self, transcript): - self.clear_current_line() - final_text = f"{self.channel_name}: {transcript}" - print(final_text, flush=True) - self.current_turn_line = None - self.line_count += 1 - - def on_message(self, ws, message): - """Handle 
transcription results.""" - data = json.loads(message) - msg_type = data.get('type') - - if msg_type == "Turn": - transcript = data.get('transcript', '').strip() - formatted = data.get('turn_is_formatted', False) - words = data.get('words', []) - - if transcript or words: - if formatted: - self.print_final_transcript(transcript) - else: - self.print_partial_transcript(words) - - def start_transcription(self): - self.load_audio_channel() - - self.ws_app = websocket.WebSocketApp( - API_ENDPOINT, - header={"Authorization": YOUR_API_KEY}, - on_open=self.on_open, - on_message=self.on_message, - ) - - thread = threading.Thread(target=self.ws_app.run_forever, daemon=True) - thread.start() - return thread - - def play_audio_file(): - try: - with wave.open(AUDIO_FILE_PATH, 'rb') as wf: - p = pyaudio.PyAudio() - - stream = p.open( - format=p.get_format_from_width(wf.getsampwidth()), - channels=wf.getnchannels(), - rate=wf.getframerate(), - output=True - ) - - print(f"Playing audio: {AUDIO_FILE_PATH}") - - # Play audio in chunks - chunk_size = 1024 - data = wf.readframes(chunk_size) - - while data: - stream.write(data) - data = wf.readframes(chunk_size) - - stream.stop_stream() - stream.close() - p.terminate() - - print("Audio playback finished") - - except Exception as e: - print(f"Error playing audio: {e}") - - - def transcribe_multichannel(): - # Create transcribers for each channel - transcriber_1 = ChannelTranscriber(0, "Speaker 1") - transcriber_2 = ChannelTranscriber(1, "Speaker 2") - - # Start audio playback - audio_thread = threading.Thread(target=play_audio_file, daemon=True) - audio_thread.start() - - # Start both transcriptions - thread_1 = transcriber_1.start_transcription() - thread_2 = transcriber_2.start_transcription() - - # Wait for completion - thread_1.join() - thread_2.join() - audio_thread.join() - - if __name__ == "__main__": - transcribe_multichannel() - ``` - +```python +import logging +from typing import Type +import threading +import time +import wave +import numpy as np +import pyaudio + +import assemblyai as aai +from assemblyai.streaming.v3 import ( + BeginEvent, + StreamingClient, + StreamingClientOptions, + StreamingError, + StreamingEvents, + StreamingParameters, + TerminationEvent, + TurnEvent, +) + +# Configuration +API_KEY = "" +AUDIO_FILE_PATH = "" + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class ChannelTranscriber: + def __init__(self, channel_id, channel_name, sample_rate): + self.channel_id = channel_id + self.channel_name = channel_name + self.sample_rate = sample_rate + self.client = None + self.audio_data = [] + self.current_turn_line = None + self.line_count = 0 + self.streaming_done = threading.Event() + + def load_audio_channel(self): + """Extract single channel from dual-channel audio file.""" + with wave.open(AUDIO_FILE_PATH, 'rb') as wf: + frames = wf.readframes(wf.getnframes()) + audio_array = np.frombuffer(frames, dtype=np.int16) + + if wf.getnchannels() == 2: + audio_array = audio_array.reshape(-1, 2) + channel_audio = audio_array[:, self.channel_id] + + # Split into chunks for streaming + FRAMES_PER_BUFFER = 400 # 50ms chunks + for i in range(0, len(channel_audio), FRAMES_PER_BUFFER): + chunk = channel_audio[i:i+FRAMES_PER_BUFFER] + if len(chunk) < FRAMES_PER_BUFFER: + chunk = np.pad(chunk, (0, FRAMES_PER_BUFFER - len(chunk)), 'constant') + self.audio_data.append(chunk.astype(np.int16).tobytes()) + + def clear_current_line(self): + if self.current_turn_line is not None: + print("\r" + " " * 100 + "\r", 
end="", flush=True) + + def print_partial_transcript(self, words): + self.clear_current_line() + # Build transcript from individual words + word_texts = [word.text for word in words] + transcript = ' '.join(word_texts) + partial_text = f"{self.channel_name}: {transcript}" + print(partial_text, end="", flush=True) + self.current_turn_line = len(partial_text) + + def print_final_transcript(self, transcript): + self.clear_current_line() + final_text = f"{self.channel_name}: {transcript}" + print(final_text, flush=True) + self.current_turn_line = None + self.line_count += 1 + + def on_begin(self, client: Type[StreamingClient], event: BeginEvent): + """Called when the streaming session begins.""" + pass # Session started + + def on_turn(self, client: Type[StreamingClient], event: TurnEvent): + """Called when a turn is received.""" + transcript = event.transcript.strip() if event.transcript else '' + formatted = event.turn_is_formatted + words = event.words if event.words else [] + + if transcript or words: + if formatted: + self.print_final_transcript(transcript) + else: + self.print_partial_transcript(words) + + def on_terminated(self, client: Type[StreamingClient], event: TerminationEvent): + """Called when the session is terminated.""" + self.clear_current_line() + self.streaming_done.set() + + def on_error(self, client: Type[StreamingClient], error: StreamingError): + """Called when an error occurs.""" + print(f"\n{self.channel_name}: Error: {error}") + self.streaming_done.set() + + def start_transcription(self): + """Start the transcription for this channel.""" + self.load_audio_channel() + + # Create streaming client + self.client = StreamingClient( + StreamingClientOptions( + api_key=API_KEY, + api_host="streaming.assemblyai.com", + ) + ) + + # Register event handlers + self.client.on(StreamingEvents.Begin, self.on_begin) + self.client.on(StreamingEvents.Turn, self.on_turn) + self.client.on(StreamingEvents.Termination, self.on_terminated) + self.client.on(StreamingEvents.Error, self.on_error) + + # Connect to streaming service with turn detection configuration + self.client.connect( + StreamingParameters( + sample_rate=self.sample_rate, + format_turns=True, + end_of_turn_confidence_threshold=0.4, + min_end_of_turn_silence_when_confident=160, + max_turn_silence=400, + ) + ) + + # Create audio generator + def audio_generator(): + for chunk in self.audio_data: + yield chunk + time.sleep(0.05) # 50ms intervals + + try: + # Stream audio + self.client.stream(audio_generator()) + finally: + # Disconnect + self.client.disconnect(terminate=True) + self.streaming_done.set() + + def start_transcription_thread(self): + """Start transcription in a separate thread.""" + thread = threading.Thread(target=self.start_transcription, daemon=True) + thread.start() + return thread + + +def play_audio_file(): + try: + with wave.open(AUDIO_FILE_PATH, 'rb') as wf: + p = pyaudio.PyAudio() + + stream = p.open( + format=p.get_format_from_width(wf.getsampwidth()), + channels=wf.getnchannels(), + rate=wf.getframerate(), + output=True + ) + + print(f"Playing audio: {AUDIO_FILE_PATH}") + + # Play audio in chunks + chunk_size = 1024 + data = wf.readframes(chunk_size) + + while data: + stream.write(data) + data = wf.readframes(chunk_size) + + stream.stop_stream() + stream.close() + p.terminate() + + print("Audio playback finished") + + except Exception as e: + print(f"Error playing audio: {e}") + + +def transcribe_multichannel(): + # Get sample rate from file + with wave.open(AUDIO_FILE_PATH, 'rb') as wf: + sample_rate = 
wf.getframerate() + + # Create transcribers for each channel + transcriber_1 = ChannelTranscriber(0, "Speaker 1", sample_rate) + transcriber_2 = ChannelTranscriber(1, "Speaker 2", sample_rate) + + # Start audio playback + audio_thread = threading.Thread(target=play_audio_file, daemon=True) + audio_thread.start() + + # Start both transcriptions + thread_1 = transcriber_1.start_transcription_thread() + thread_2 = transcriber_2.start_transcription_thread() + + # Wait for completion + thread_1.join() + thread_2.join() + audio_thread.join() + + +if __name__ == "__main__": + transcribe_multichannel() +``` - + + Firstly, install the required dependencies. - - ```bash - npm install ws - ``` +```bash +pip install websocket-client numpy pyaudio +``` Use this complete script to transcribe dual-channel audio with speaker separation: +```python +import websocket +import json +import threading +import numpy as np +import wave +import time +import pyaudio +from urllib.parse import urlencode + +# Configuration +YOUR_API_KEY = "" +AUDIO_FILE_PATH = "" +API_BASE_URL = "wss://streaming.assemblyai.com/v3/ws" +API_PARAMS = { + "sample_rate": 8000, + "format_turns": "true", + "end_of_turn_confidence_threshold": 0.4, + "min_end_of_turn_silence_when_confident": 160, + "max_turn_silence": 400, +} +# Build API endpoint with URL encoding +API_ENDPOINT = f"{API_BASE_URL}?{urlencode(API_PARAMS)}" + +class ChannelTranscriber: + def __init__(self, channel_id, channel_name): + self.channel_id = channel_id + self.channel_name = channel_name + self.ws_app = None + self.audio_data = [] + self.current_turn_line = None + self.line_count = 0 + + def load_audio_channel(self): + """Extract single channel from dual-channel audio file.""" + with wave.open(AUDIO_FILE_PATH, 'rb') as wf: + frames = wf.readframes(wf.getnframes()) + audio_array = np.frombuffer(frames, dtype=np.int16) + + if wf.getnchannels() == 2: + audio_array = audio_array.reshape(-1, 2) + channel_audio = audio_array[:, self.channel_id] + + # Split into chunks for streaming + FRAMES_PER_BUFFER = 400 # 50ms chunks + for i in range(0, len(channel_audio), FRAMES_PER_BUFFER): + chunk = channel_audio[i:i+FRAMES_PER_BUFFER] + if len(chunk) < FRAMES_PER_BUFFER: + chunk = np.pad(chunk, (0, FRAMES_PER_BUFFER - len(chunk)), 'constant') + self.audio_data.append(chunk.astype(np.int16).tobytes()) + + def on_open(self, ws): + """Stream audio data when connection opens.""" + def stream_audio(): + for chunk in self.audio_data: + ws.send(chunk, websocket.ABNF.OPCODE_BINARY) + time.sleep(0.05) # 50ms intervals + + # Send termination message + terminate_message = {"type": "Terminate"} + ws.send(json.dumps(terminate_message)) + + threading.Thread(target=stream_audio, daemon=True).start() + + def clear_current_line(self): + if self.current_turn_line is not None: + print("\r" + " " * 100 + "\r", end="", flush=True) + + def print_partial_transcript(self, words): + self.clear_current_line() + # Build transcript from individual words + word_texts = [word.get('text', '') for word in words] + transcript = ' '.join(word_texts) + partial_text = f"{self.channel_name}: {transcript}" + print(partial_text, end="", flush=True) + self.current_turn_line = len(partial_text) + + def print_final_transcript(self, transcript): + self.clear_current_line() + final_text = f"{self.channel_name}: {transcript}" + print(final_text, flush=True) + self.current_turn_line = None + self.line_count += 1 + + def on_message(self, ws, message): + """Handle transcription results.""" + data = json.loads(message) + msg_type = 
data.get('type') + + if msg_type == "Turn": + transcript = data.get('transcript', '').strip() + formatted = data.get('turn_is_formatted', False) + words = data.get('words', []) + + if transcript or words: + if formatted: + self.print_final_transcript(transcript) + else: + self.print_partial_transcript(words) + + def start_transcription(self): + self.load_audio_channel() + + self.ws_app = websocket.WebSocketApp( + API_ENDPOINT, + header={"Authorization": YOUR_API_KEY}, + on_open=self.on_open, + on_message=self.on_message, + ) + + thread = threading.Thread(target=self.ws_app.run_forever, daemon=True) + thread.start() + return thread + +def play_audio_file(): + try: + with wave.open(AUDIO_FILE_PATH, 'rb') as wf: + p = pyaudio.PyAudio() + + stream = p.open( + format=p.get_format_from_width(wf.getsampwidth()), + channels=wf.getnchannels(), + rate=wf.getframerate(), + output=True + ) + + print(f"Playing audio: {AUDIO_FILE_PATH}") + + # Play audio in chunks + chunk_size = 1024 + data = wf.readframes(chunk_size) + + while data: + stream.write(data) + data = wf.readframes(chunk_size) + + stream.stop_stream() + stream.close() + p.terminate() + + print("Audio playback finished") + + except Exception as e: + print(f"Error playing audio: {e}") + + +def transcribe_multichannel(): + # Create transcribers for each channel + transcriber_1 = ChannelTranscriber(0, "Speaker 1") + transcriber_2 = ChannelTranscriber(1, "Speaker 2") + + # Start audio playback + audio_thread = threading.Thread(target=play_audio_file, daemon=True) + audio_thread.start() + + # Start both transcriptions + thread_1 = transcriber_1.start_transcription() + thread_2 = transcriber_2.start_transcription() + + # Wait for completion + thread_1.join() + thread_2.join() + audio_thread.join() + +if __name__ == "__main__": + transcribe_multichannel() +``` + - ```javascript - const WebSocket = require('ws'); - const fs = require('fs'); - const { spawn } = require('child_process'); - - // Configuration - const YOUR_API_KEY = ''; - const AUDIO_FILE_PATH = ''; - const API_BASE_URL = 'wss://streaming.assemblyai.com/v3/ws'; - const API_PARAMS = { - sample_rate: 8000, - format_turns: 'true', - }; - - // Build API endpoint with URL encoding - const queryString = new URLSearchParams(API_PARAMS).toString(); - const API_ENDPOINT = `${API_BASE_URL}?${queryString}`; - - // Simple WAV file parser - class SimpleWavParser { - constructor(filePath) { - this.buffer = fs.readFileSync(filePath); - this.parseHeader(); - } - - parseHeader() { - // Read WAV header - this.channels = this.buffer.readUInt16LE(22); - this.sampleRate = this.buffer.readUInt32LE(24); - this.bitsPerSample = this.buffer.readUInt16LE(34); - - // Find data chunk - let dataOffset = 12; - while (dataOffset < this.buffer.length - 8) { - const chunkId = this.buffer.toString('ascii', dataOffset, dataOffset + 4); - const chunkSize = this.buffer.readUInt32LE(dataOffset + 4); - - if (chunkId === 'data') { - this.dataStart = dataOffset + 8; - this.dataSize = chunkSize; - break; - } - - dataOffset += 8 + chunkSize; - } - } + + + - getChannelData(channelIndex) { - if (this.channels !== 2) { - throw new Error('Audio file is not stereo'); - } + + + - const bytesPerSample = this.bitsPerSample / 8; - const samplesPerChannel = this.dataSize / (bytesPerSample * this.channels); - const channelData = []; + Firstly, install the required dependencies. 
+```bash +npm install assemblyai +``` - // Extract samples for the specified channel - for (let i = 0; i < samplesPerChannel; i++) { - const sampleOffset = this.dataStart + (i * this.channels + channelIndex) * bytesPerSample; - - if (this.bitsPerSample === 16) { - const sample = this.buffer.readInt16LE(sampleOffset); - channelData.push(sample); - } else if (this.bitsPerSample === 8) { - const sample = this.buffer.readUInt8(sampleOffset) - 128; - channelData.push(sample * 256); // Convert to 16-bit range - } + + + Use this complete script to transcribe dual-channel audio with speaker separation: +```javascript +import { AssemblyAI } from 'assemblyai'; +import fs from 'fs'; +import { spawn } from 'child_process'; +import { Readable } from 'stream'; + +// Configuration +const YOUR_API_KEY = ''; +const AUDIO_FILE_PATH = ''; + +// Simple WAV file parser +class SimpleWavParser { + constructor(filePath) { + this.buffer = fs.readFileSync(filePath); + this.parseHeader(); + } + + parseHeader() { + // Read WAV header + this.channels = this.buffer.readUInt16LE(22); + this.sampleRate = this.buffer.readUInt32LE(24); + this.bitsPerSample = this.buffer.readUInt16LE(34); + + // Find data chunk + let dataOffset = 12; + while (dataOffset < this.buffer.length - 8) { + const chunkId = this.buffer.toString('ascii', dataOffset, dataOffset + 4); + const chunkSize = this.buffer.readUInt32LE(dataOffset + 4); + + if (chunkId === 'data') { + this.dataStart = dataOffset + 8; + this.dataSize = chunkSize; + break; + } + + dataOffset += 8 + chunkSize; + } + } + + getChannelData(channelIndex) { + if (this.channels !== 2) { + throw new Error('Audio file is not stereo'); + } + + const bytesPerSample = this.bitsPerSample / 8; + const samplesPerChannel = this.dataSize / (bytesPerSample * this.channels); + const channelData = []; + + // Extract samples for the specified channel + for (let i = 0; i < samplesPerChannel; i++) { + const sampleOffset = this.dataStart + (i * this.channels + channelIndex) * bytesPerSample; + + if (this.bitsPerSample === 16) { + const sample = this.buffer.readInt16LE(sampleOffset); + channelData.push(sample); + } else if (this.bitsPerSample === 8) { + const sample = this.buffer.readUInt8(sampleOffset) - 128; + channelData.push(sample * 256); // Convert to 16-bit range + } + } + + return channelData; + } +} + +class ChannelTranscriber { + constructor(client, channelId, channelName, sampleRate) { + this.client = client; + this.channelId = channelId; + this.channelName = channelName; + this.sampleRate = sampleRate; + this.transcriber = null; + this.audioData = []; + this.currentTurnLine = null; + this.lineCount = 0; + } + + loadAudioChannel() { + try { + const wavParser = new SimpleWavParser(AUDIO_FILE_PATH); + const channelSamples = wavParser.getChannelData(this.channelId); + + // Split into chunks for streaming (50ms chunks) + const FRAMES_PER_BUFFER = Math.floor(this.sampleRate * 0.05); // 50ms + + for (let i = 0; i < channelSamples.length; i += FRAMES_PER_BUFFER) { + const chunkArray = new Int16Array(FRAMES_PER_BUFFER); + + // Copy samples and pad if necessary + for (let j = 0; j < FRAMES_PER_BUFFER; j++) { + if (i + j < channelSamples.length) { + chunkArray[j] = channelSamples[i + j]; + } else { + chunkArray[j] = 0; // Pad with silence } - - return channelData; } + + // Convert to Buffer (Little Endian) + const buffer = Buffer.from(chunkArray.buffer); + this.audioData.push(buffer); } + } catch (error) { + throw error; + } + } + + clearCurrentLine() { + if (this.currentTurnLine !== null) { + 
process.stdout.write('\r' + ' '.repeat(100) + '\r'); + } + } + + printPartialTranscript(words) { + this.clearCurrentLine(); + // Build transcript from individual words + const wordTexts = words.map(word => word.text || ''); + const transcript = wordTexts.join(' '); + const partialText = `${this.channelName}: ${transcript}`; + process.stdout.write(partialText); + this.currentTurnLine = partialText.length; + } + + printFinalTranscript(transcript) { + this.clearCurrentLine(); + const finalText = `${this.channelName}: ${transcript}`; + console.log(finalText); + this.currentTurnLine = null; + this.lineCount++; + } + + async startTranscription() { + try { + this.loadAudioChannel(); + } catch (error) { + throw error; + } + + const turnDetectionConfig = { + endOfTurnConfidenceThreshold: 0.4, + minEndOfTurnSilenceWhenConfident: 160, + maxTurnSilence: 400 + }; + + // Create transcriber with SDK + this.transcriber = this.client.streaming.transcriber({ + sampleRate: this.sampleRate, + formatTurns: true, + ...turnDetectionConfig + }); + + // Set up event handlers + this.transcriber.on('open', ({ id }) => { + // Session opened + }); + + this.transcriber.on('error', (error) => { + console.error(`\n${this.channelName}: Error:`, error); + }); + + this.transcriber.on('close', (code, reason) => { + this.clearCurrentLine(); + if (code !== 1000 && code !== 1001) { + console.log(`\n${this.channelName}: Connection closed unexpectedly`); + } + }); - class ChannelTranscriber { - constructor(channelId, channelName) { - this.channelId = channelId; - this.channelName = channelName; - this.ws = null; - this.audioData = []; - this.currentTurnLine = null; - this.lineCount = 0; - this.isConnected = false; + this.transcriber.on('turn', (turn) => { + const transcript = (turn.transcript || '').trim(); + const formatted = turn.turn_is_formatted || false; + const words = turn.words || []; + + if (transcript || words.length > 0) { + if (formatted) { + this.printFinalTranscript(transcript); + } else { + this.printPartialTranscript(words); } + } + }); - loadAudioChannel() { - try { - const wavParser = new SimpleWavParser(AUDIO_FILE_PATH); - const channelSamples = wavParser.getChannelData(this.channelId); - - // Split into chunks for streaming (50ms chunks at 8000Hz = 400 samples) - const FRAMES_PER_BUFFER = 400; - - for (let i = 0; i < channelSamples.length; i += FRAMES_PER_BUFFER) { - const chunkArray = new Int16Array(FRAMES_PER_BUFFER); - - // Copy samples and pad if necessary - for (let j = 0; j < FRAMES_PER_BUFFER; j++) { - if (i + j < channelSamples.length) { - chunkArray[j] = channelSamples[i + j]; - } else { - chunkArray[j] = 0; // Pad with silence - } - } - - // Convert to Buffer (Little Endian) - const buffer = Buffer.from(chunkArray.buffer); - this.audioData.push(buffer); - } - } catch (error) { - throw error; - } - } + // Connect to the streaming service + await this.transcriber.connect(); - clearCurrentLine() { - if (this.currentTurnLine !== null) { - process.stdout.write('\r' + ' '.repeat(100) + '\r'); - } + // Create a readable stream from audio chunks + const audioStream = new Readable({ + async read() { + // This will be controlled by our manual push below + } + }); + + // Pipe audio stream to transcriber + Readable.toWeb(audioStream).pipeTo(this.transcriber.stream()); + + // Stream audio data + for (const chunk of this.audioData) { + audioStream.push(chunk); + await new Promise(resolve => setTimeout(resolve, 50)); // 50ms intervals + } + + // Signal end of stream + audioStream.push(null); + + // Wait a bit for 
final transcripts + await new Promise(resolve => setTimeout(resolve, 1000)); + + // Close the transcriber + await this.transcriber.close(); + } + + async close() { + if (this.transcriber) { + await this.transcriber.close(); + } + } +} + +function playAudioFile() { + return new Promise((resolve) => { + console.log(`Playing audio: ${AUDIO_FILE_PATH}`); + + // Use platform-specific audio player + let command; + let args; + + if (process.platform === 'darwin') { + // macOS + command = 'afplay'; + args = [AUDIO_FILE_PATH]; + } else if (process.platform === 'win32') { + // Windows - using PowerShell + command = 'powershell'; + args = ['-c', `(New-Object Media.SoundPlayer '${AUDIO_FILE_PATH}').PlaySync()`]; + } else { + // Linux - try aplay + command = 'aplay'; + args = [AUDIO_FILE_PATH]; + } + + try { + const player = spawn(command, args, { + stdio: ['ignore', 'ignore', 'ignore'] // Suppress all output from player + }); + + player.on('close', (code) => { + if (code === 0) { + console.log('Audio playback finished'); } + resolve(); + }); + + player.on('error', (error) => { + // Silently continue without audio + resolve(); + }); + } catch (error) { + resolve(); + } + }); +} + +async function transcribeMultichannel() { + // Verify API key is set + if (YOUR_API_KEY === '') { + console.error('ERROR: Please set YOUR_API_KEY before running'); + process.exit(1); + } + + // Verify file exists + if (!fs.existsSync(AUDIO_FILE_PATH)) { + console.error(`ERROR: Audio file not found: ${AUDIO_FILE_PATH}`); + process.exit(1); + } + + // Get sample rate from file + const wavParser = new SimpleWavParser(AUDIO_FILE_PATH); + const sampleRate = wavParser.sampleRate; + + // Create SDK client + const client = new AssemblyAI({ + apiKey: YOUR_API_KEY + }); + + const transcriber1 = new ChannelTranscriber(client, 0, 'Speaker 1', sampleRate); + const transcriber2 = new ChannelTranscriber(client, 1, 'Speaker 2', sampleRate); + + try { + // Start audio playback (non-blocking) + const audioPromise = playAudioFile(); + + // Start both transcriptions + const transcriptionPromises = [ + transcriber1.startTranscription(), + transcriber2.startTranscription() + ]; + + // Wait for all to complete + await Promise.all([...transcriptionPromises, audioPromise]); + + } catch (error) { + console.error('\nError during transcription:', error.message); + + // Clean up + await transcriber1.close(); + await transcriber2.close(); + + process.exit(1); + } +} + +// Handle graceful shutdown +process.on('SIGINT', () => { + console.log('\n'); // Clean line break before exit + process.exit(0); +}); + +// Main execution +transcribeMultichannel(); +``` - printPartialTranscript(words) { - this.clearCurrentLine(); - // Build transcript from individual words - const wordTexts = words.map(word => word.text || ''); - const transcript = wordTexts.join(' '); - const partialText = `${this.channelName}: ${transcript}`; - process.stdout.write(partialText); - this.currentTurnLine = partialText.length; - } + + + - printFinalTranscript(transcript) { - this.clearCurrentLine(); - const finalText = `${this.channelName}: ${transcript}`; - console.log(finalText); - this.currentTurnLine = null; - this.lineCount++; - } + - async streamAudio() { - // Wait a bit for connection to stabilize - await new Promise(resolve => setTimeout(resolve, 100)); - - for (const chunk of this.audioData) { - if (this.ws.readyState === WebSocket.OPEN) { - this.ws.send(chunk, { binary: true }); - await new Promise(resolve => setTimeout(resolve, 50)); // 50ms intervals - } else { - break; - } - } 
- - // Send termination message - if (this.ws.readyState === WebSocket.OPEN) { - const terminateMessage = { type: 'Terminate' }; - this.ws.send(JSON.stringify(terminateMessage)); - } - } + + - startTranscription() { - return new Promise((resolve, reject) => { - try { - this.loadAudioChannel(); - } catch (error) { - reject(error); - return; - } - - this.ws = new WebSocket(API_ENDPOINT, { - headers: { - Authorization: YOUR_API_KEY - } - }); - - this.ws.on('open', () => { - this.isConnected = true; - // Start streaming audio - this.streamAudio().catch(error => {}); - }); - - this.ws.on('message', (data) => { - try { - const message = JSON.parse(data.toString()); - const msgType = message.type; - - if (msgType === 'Turn') { - const transcript = (message.transcript || '').trim(); - const formatted = message.turn_is_formatted || false; - const words = message.words || []; - - if (transcript || words.length > 0) { - if (formatted) { - this.printFinalTranscript(transcript); - } else { - this.printPartialTranscript(words); - } - } - } else if (msgType === 'error') { - console.error(`\n${this.channelName}: API Error:`, message.error); - } - } catch (error) { - // Silently ignore parse errors - } - }); - - this.ws.on('close', (code, reason) => { - this.clearCurrentLine(); - if (code !== 1000 && code !== 1001) { - console.log(`\n${this.channelName}: Connection closed unexpectedly`); - } - this.isConnected = false; - resolve(); - }); - - this.ws.on('error', (error) => { - console.error(`\n${this.channelName} WebSocket error:`, error.message); - this.isConnected = false; - reject(error); - }); - }); - } + Firstly, install the required dependencies. +```bash +npm install ws +``` - close() { - if (this.ws && this.isConnected) { - this.ws.close(); - } - } + + + Use this complete script to transcribe dual-channel audio with speaker separation: +```javascript +const WebSocket = require('ws'); +const fs = require('fs'); +const { spawn } = require('child_process'); + +// Configuration +const YOUR_API_KEY = ''; +const AUDIO_FILE_PATH = ''; +const API_BASE_URL = 'wss://streaming.assemblyai.com/v3/ws'; +const API_PARAMS = { + sample_rate: 8000, + format_turns: 'true', + end_of_turn_confidence_threshold: 0.4, + min_end_of_turn_silence_when_confident: 160, + max_turn_silence: 400, +}; + +// Build API endpoint with URL encoding +const queryString = new URLSearchParams(API_PARAMS).toString(); +const API_ENDPOINT = `${API_BASE_URL}?${queryString}`; + +// Simple WAV file parser +class SimpleWavParser { + constructor(filePath) { + this.buffer = fs.readFileSync(filePath); + this.parseHeader(); + } + + parseHeader() { + // Read WAV header + this.channels = this.buffer.readUInt16LE(22); + this.sampleRate = this.buffer.readUInt32LE(24); + this.bitsPerSample = this.buffer.readUInt16LE(34); + + // Find data chunk + let dataOffset = 12; + while (dataOffset < this.buffer.length - 8) { + const chunkId = this.buffer.toString('ascii', dataOffset, dataOffset + 4); + const chunkSize = this.buffer.readUInt32LE(dataOffset + 4); + + if (chunkId === 'data') { + this.dataStart = dataOffset + 8; + this.dataSize = chunkSize; + break; } - - function playAudioFile() { - return new Promise((resolve) => { - console.log(`Playing audio: ${AUDIO_FILE_PATH}`); - - // Use platform-specific audio player - let command; - let args; - - if (process.platform === 'darwin') { - // macOS - command = 'afplay'; - args = [AUDIO_FILE_PATH]; - } else if (process.platform === 'win32') { - // Windows - using PowerShell - command = 'powershell'; - args = ['-c', 
`(New-Object Media.SoundPlayer '${AUDIO_FILE_PATH}').PlaySync()`]; + + dataOffset += 8 + chunkSize; + } + } + + getChannelData(channelIndex) { + if (this.channels !== 2) { + throw new Error('Audio file is not stereo'); + } + + const bytesPerSample = this.bitsPerSample / 8; + const samplesPerChannel = this.dataSize / (bytesPerSample * this.channels); + const channelData = []; + + // Extract samples for the specified channel + for (let i = 0; i < samplesPerChannel; i++) { + const sampleOffset = this.dataStart + (i * this.channels + channelIndex) * bytesPerSample; + + if (this.bitsPerSample === 16) { + const sample = this.buffer.readInt16LE(sampleOffset); + channelData.push(sample); + } else if (this.bitsPerSample === 8) { + const sample = this.buffer.readUInt8(sampleOffset) - 128; + channelData.push(sample * 256); // Convert to 16-bit range + } + } + + return channelData; + } +} + +class ChannelTranscriber { + constructor(channelId, channelName) { + this.channelId = channelId; + this.channelName = channelName; + this.ws = null; + this.audioData = []; + this.currentTurnLine = null; + this.lineCount = 0; + this.isConnected = false; + } + + loadAudioChannel() { + try { + const wavParser = new SimpleWavParser(AUDIO_FILE_PATH); + const channelSamples = wavParser.getChannelData(this.channelId); + + // Split into chunks for streaming (50ms chunks at 8000Hz = 400 samples) + const FRAMES_PER_BUFFER = 400; + + for (let i = 0; i < channelSamples.length; i += FRAMES_PER_BUFFER) { + const chunkArray = new Int16Array(FRAMES_PER_BUFFER); + + // Copy samples and pad if necessary + for (let j = 0; j < FRAMES_PER_BUFFER; j++) { + if (i + j < channelSamples.length) { + chunkArray[j] = channelSamples[i + j]; } else { - // Linux - try aplay - command = 'aplay'; - args = [AUDIO_FILE_PATH]; - } - - try { - const player = spawn(command, args, { - stdio: ['ignore', 'ignore', 'ignore'] // Suppress all output from player - }); - - player.on('close', (code) => { - if (code === 0) { - console.log('Audio playback finished'); - } - resolve(); - }); - - player.on('error', (error) => { - // Silently continue without audio - resolve(); - }); - } catch (error) { - resolve(); + chunkArray[j] = 0; // Pad with silence } - }); - } - - async function transcribeMultichannel() { - const transcriber1 = new ChannelTranscriber(0, 'Speaker 1'); - const transcriber2 = new ChannelTranscriber(1, 'Speaker 2'); + } + // Convert to Buffer (Little Endian) + const buffer = Buffer.from(chunkArray.buffer); + this.audioData.push(buffer); + } + } catch (error) { + throw error; + } + } + + clearCurrentLine() { + if (this.currentTurnLine !== null) { + process.stdout.write('\r' + ' '.repeat(100) + '\r'); + } + } + + printPartialTranscript(words) { + this.clearCurrentLine(); + // Build transcript from individual words + const wordTexts = words.map(word => word.text || ''); + const transcript = wordTexts.join(' '); + const partialText = `${this.channelName}: ${transcript}`; + process.stdout.write(partialText); + this.currentTurnLine = partialText.length; + } + + printFinalTranscript(transcript) { + this.clearCurrentLine(); + const finalText = `${this.channelName}: ${transcript}`; + console.log(finalText); + this.currentTurnLine = null; + this.lineCount++; + } + + async streamAudio() { + // Wait a bit for connection to stabilize + await new Promise(resolve => setTimeout(resolve, 100)); + + for (const chunk of this.audioData) { + if (this.ws.readyState === WebSocket.OPEN) { + this.ws.send(chunk, { binary: true }); + await new Promise(resolve => 
setTimeout(resolve, 50)); // 50ms intervals + } else { + break; + } + } + + // Send termination message + if (this.ws.readyState === WebSocket.OPEN) { + const terminateMessage = { type: 'Terminate' }; + this.ws.send(JSON.stringify(terminateMessage)); + } + } + + startTranscription() { + return new Promise((resolve, reject) => { + try { + this.loadAudioChannel(); + } catch (error) { + reject(error); + return; + } + + this.ws = new WebSocket(API_ENDPOINT, { + headers: { + Authorization: YOUR_API_KEY + } + }); + + this.ws.on('open', () => { + this.isConnected = true; + // Start streaming audio + this.streamAudio().catch(error => {}); + }); + + this.ws.on('message', (data) => { try { - // Verify API key is set - if (YOUR_API_KEY === '') { - console.error('ERROR: Please set YOUR_API_KEY before running'); - process.exit(1); - } + const message = JSON.parse(data.toString()); + const msgType = message.type; - // Verify file exists - if (!fs.existsSync(AUDIO_FILE_PATH)) { - console.error(`ERROR: Audio file not found: ${AUDIO_FILE_PATH}`); - process.exit(1); + if (msgType === 'Turn') { + const transcript = (message.transcript || '').trim(); + const formatted = message.turn_is_formatted || false; + const words = message.words || []; + + if (transcript || words.length > 0) { + if (formatted) { + this.printFinalTranscript(transcript); + } else { + this.printPartialTranscript(words); + } + } + } else if (msgType === 'error') { + console.error(`\n${this.channelName}: API Error:`, message.error); } - - // Start audio playback (non-blocking) - const audioPromise = playAudioFile(); - - // Start both transcriptions - const transcriptionPromises = [ - transcriber1.startTranscription(), - transcriber2.startTranscription() - ]; - - // Wait for all to complete - await Promise.all([...transcriptionPromises, audioPromise]); - } catch (error) { - console.error('\nError during transcription:', error.message); - - // Clean up - transcriber1.close(); - transcriber2.close(); - - process.exit(1); + // Silently ignore parse errors } - } - - // Handle graceful shutdown - process.on('SIGINT', () => { - console.log('\n'); // Clean line break before exit - process.exit(0); }); + + this.ws.on('close', (code, reason) => { + this.clearCurrentLine(); + if (code !== 1000 && code !== 1001) { + console.log(`\n${this.channelName}: Connection closed unexpectedly`); + } + this.isConnected = false; + resolve(); + }); + + this.ws.on('error', (error) => { + console.error(`\n${this.channelName} WebSocket error:`, error.message); + this.isConnected = false; + reject(error); + }); + }); + } + + close() { + if (this.ws && this.isConnected) { + this.ws.close(); + } + } +} + +function playAudioFile() { + return new Promise((resolve) => { + console.log(`Playing audio: ${AUDIO_FILE_PATH}`); + + // Use platform-specific audio player + let command; + let args; + + if (process.platform === 'darwin') { + // macOS + command = 'afplay'; + args = [AUDIO_FILE_PATH]; + } else if (process.platform === 'win32') { + // Windows - using PowerShell + command = 'powershell'; + args = ['-c', `(New-Object Media.SoundPlayer '${AUDIO_FILE_PATH}').PlaySync()`]; + } else { + // Linux - try aplay + command = 'aplay'; + args = [AUDIO_FILE_PATH]; + } + + try { + const player = spawn(command, args, { + stdio: ['ignore', 'ignore', 'ignore'] // Suppress all output from player + }); + + player.on('close', (code) => { + if (code === 0) { + console.log('Audio playback finished'); + } + resolve(); + }); + + player.on('error', (error) => { + // Silently continue without audio + 
resolve(); + }); + } catch (error) { + resolve(); + } + }); +} + +async function transcribeMultichannel() { + const transcriber1 = new ChannelTranscriber(0, 'Speaker 1'); + const transcriber2 = new ChannelTranscriber(1, 'Speaker 2'); + + try { + // Verify API key is set + if (YOUR_API_KEY === '') { + console.error('ERROR: Please set YOUR_API_KEY before running'); + process.exit(1); + } + + // Verify file exists + if (!fs.existsSync(AUDIO_FILE_PATH)) { + console.error(`ERROR: Audio file not found: ${AUDIO_FILE_PATH}`); + process.exit(1); + } + + // Start audio playback (non-blocking) + const audioPromise = playAudioFile(); + + // Start both transcriptions + const transcriptionPromises = [ + transcriber1.startTranscription(), + transcriber2.startTranscription() + ]; + + // Wait for all to complete + await Promise.all([...transcriptionPromises, audioPromise]); + + } catch (error) { + console.error('\nError during transcription:', error.message); + + // Clean up + transcriber1.close(); + transcriber2.close(); + + process.exit(1); + } +} + +// Handle graceful shutdown +process.on('SIGINT', () => { + console.log('\n'); // Clean line break before exit + process.exit(0); +}); + +// Main execution +if (require.main === module) { + transcribeMultichannel(); +} +``` - // Main execution - if (require.main === module) { - transcribeMultichannel(); - } - - ``` @@ -535,3 +1101,70 @@ The following code example demonstrates how to transcribe a dual-channel audio f + +The examples above use turn detection settings optimized for short responses and rapid back-and-forth conversations. To optimize for your specific audio scenario, you can adjust the turn detection parameters. + +For configuration examples tailored to different use cases, refer to our [Configuration examples](/docs/universal-streaming/turn-detection#quick-start-configurations). + + + +Modify the `StreamingParameters` in the `start_transcription` method: +```python +# Connect to streaming service with turn detection configuration +self.client.connect( + StreamingParameters( + sample_rate=self.sample_rate, + format_turns=True, + end_of_turn_confidence_threshold=0.4, + min_end_of_turn_silence_when_confident=160, + max_turn_silence=400, + ) +) +``` + + + +Modify the turn detection parameters in `API_PARAMS`: +```python +API_PARAMS = { + "sample_rate": 8000, + "format_turns": "true", + "end_of_turn_confidence_threshold": 0.4, + "min_end_of_turn_silence_when_confident": 160, + "max_turn_silence": 400, +} +``` + + + +Modify the turn detection configuration object: +```javascript +const turnDetectionConfig = { + endOfTurnConfidenceThreshold: 0.4, + minEndOfTurnSilenceWhenConfident: 160, + maxTurnSilence: 400 +}; + +// Create transcriber with SDK +this.transcriber = this.client.streaming.transcriber({ + sampleRate: this.sampleRate, + formatTurns: true, + ...turnDetectionConfig +}); +``` + + + +Modify the turn detection parameters in `API_PARAMS`: +```javascript +const API_PARAMS = { + sample_rate: 8000, + format_turns: 'true', + end_of_turn_confidence_threshold: 0.4, + min_end_of_turn_silence_when_confident: 160, + max_turn_silence: 400, +}; +``` + + +
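+
+Both examples create two `ChannelTranscriber` instances because the sample file is dual-channel, but the same pattern extends to any channel count: open one streaming session per channel. The sketch below is a minimal, hypothetical illustration of that generalization based on the Python SDK example above; it assumes the `ChannelTranscriber` class and `AUDIO_FILE_PATH` from that example, and that `load_audio_channel` has been generalized to reshape the interleaved samples by `wf.getnchannels()` instead of assuming exactly two channels.
+
+```python
+# Minimal sketch: one streaming session per channel for an N-channel WAV file.
+# Assumes ChannelTranscriber and AUDIO_FILE_PATH from the Python SDK example above,
+# with load_audio_channel generalized to reshape by wf.getnchannels().
+import wave
+
+
+def transcribe_n_channels():
+    # Read the channel count and sample rate from the WAV header
+    with wave.open(AUDIO_FILE_PATH, 'rb') as wf:
+        num_channels = wf.getnchannels()
+        sample_rate = wf.getframerate()
+
+    # One transcriber (and one streaming session) per channel
+    transcribers = [
+        ChannelTranscriber(i, f"Speaker {i + 1}", sample_rate)
+        for i in range(num_channels)
+    ]
+
+    # Run all channels concurrently and wait for them to finish
+    threads = [t.start_transcription_thread() for t in transcribers]
+    for thread in threads:
+        thread.join()
+
+
+if __name__ == "__main__":
+    transcribe_n_channels()
+```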